Refactor mygrad.nnet.layers #399

Draft: wants to merge 8 commits into base: master
Changes from all commits
2 changes: 1 addition & 1 deletion src/mygrad/__init__.py
@@ -24,7 +24,7 @@
 from mygrad.math.sequential.funcs import *
 from mygrad.math.sequential.funcs import max, min
 from mygrad.math.trigonometric.funcs import *
-from mygrad.nnet.layers.utils import sliding_window_view
+from mygrad.nnet.layers.operations.utils import sliding_window_view
 from mygrad.no_grad_funcs import *
 from mygrad.tensor_creation.funcs import *
 from mygrad.tensor_manip.array_shape.funcs import *
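Because `mygrad/__init__.py` re-exports the function from its new home, the public name `mygrad.sliding_window_view` should be unaffected by this move; only the internal module path changes. A minimal sanity check, assuming this branch is installed (the check itself is not part of the PR):

# Hypothetical check (not part of this PR): the top-level name and the new
# internal module should expose the same function object.
import mygrad as mg
from mygrad.nnet.layers.operations.utils import sliding_window_view

assert mg.sliding_window_view is sliding_window_view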
13 changes: 0 additions & 13 deletions src/mygrad/nnet/layers/__init__.py
@@ -1,13 +0,0 @@
-from .batchnorm import batchnorm
-from .conv import conv_nd
-from .pooling import max_pool
-
-__all__ = ["conv_nd", "max_pool", "batchnorm"]
-
-
-try:
-    from .gru import gru
-
-    __all__ += ["gru"]
-except ImportError:  # pragma: no cover
-    pass
197 changes: 42 additions & 155 deletions src/mygrad/nnet/layers/batchnorm.py
@@ -1,173 +1,60 @@
-from typing import Optional, Tuple, Union
-
 import numpy as np

 from mygrad import Tensor
-from mygrad.operation_base import Operation
-from mygrad.typing import ArrayLike
-
-__all__ = ["batchnorm"]
+from mygrad.tensor_creation.funcs import ones, zeros
+from mygrad.nnet.layers.operations.batchnorm import batchnorm as batchnorm_op


-# TODO: Remove affine parameters from Operation
-class BatchNorm(Operation):
-    """
-    Attributes
-    ----------
-    mean : numpy.ndarray
-    var : numpy.ndarray
+class BatchNorm:
+    """ A batch normalization layer.

-    Notes
-    -----
-    `mean` and `var` are bound as instance-attributes upon
-    calling the batch-norm instance.
+    This class will perform an n-dimensional batch normalization operation on an
+    (N, D, ...)-shaped tensor scaled by γ of shape (D, ...) and shifted by β of shape (D, ...).
     """

-    def __call__(self, x, gamma, beta, *, eps):
-        """
-        y(x) = (x - E[x]) / sqrt(Var[x} + eps)
-        batchnorm(x) = gamma * y(x) + beta
+    def __init__(self, input_channels: int, momentum: float = 0.1):
+        """ Initialize a batch normalization layer.

         Parameters
         ----------
-        x : mygrad.Tensor
-        gamma : Optional[mygrad.Tensor]
-        beta : Optional[mygrad.Tensor]
-        eps : Real
-            A small non-negative number.
-
-        Returns
-        -------
-        numpy.ndarray
+        input_channels : int
+            The number of channels of the data to be batch-normalized.
+        momentum : float, optional (default=0.1)
+            The momentum value used to maintain moving averages.
         """
-        normed_dims = tuple(i for i in range(x.ndim) if i != 1)
-        keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape))
-
-        self.variables = tuple(i for i in (x, gamma, beta))
-
-        if gamma.size == 0:
-            gamma = None
-        if beta.size == 0:
-            beta = None
+        self.gamma = ones((1, input_channels), dtype=np.float32)
+        self.beta = zeros((1, input_channels), dtype=np.float32)
+        self.moving_mean = np.zeros((1, input_channels), dtype=np.float32)
+        self.moving_variance = np.zeros((1, input_channels), dtype=np.float32)
+        self.momentum = momentum
+        self.input_channels = input_channels

-        self.gamma = gamma
-        self.beta = beta
+    def __call__(self, x: Tensor, test: bool = False) -> Tensor:
+        """ Perform the forward-pass of n-dimensional batch normalization over axis 1 on `x`.

-        x = x.data
-        self.x_norm = None # required for backprop through gamma
-        self.mean = x.mean(axis=normed_dims)
-        self.var = x.var(axis=normed_dims)
-
-        if eps:
-            self.var += eps
-
-        y = x - self.mean.reshape(keepdims_shape)
-        self._std = np.sqrt(self.var).reshape(keepdims_shape) # sqrt(var + eps)
-        y /= self._std
-        self.x_norm = y
-        # optional affine transformation
-        if gamma is not None:
-            gamma = gamma.data
-            # must copy `y` to prevent mutation of `self.x_norm`
-            y = y * gamma.reshape(keepdims_shape)
-
-        if beta is not None:
-            beta = beta.data
-            y = y + beta.reshape(keepdims_shape)
-        return y
-
-    def backward_var(self, grad, index, **kwargs):
-        x = self.variables[0].data
-        if index == 0: # backprop through x
-            normed_dims = tuple(i for i in range(x.ndim) if i != 1)
+        Parameters
+        ----------
+        x : Union[numpy.ndarray, mygrad.Tensor], shape=(N, D, ...)
+            The data to normalize.
+        test : boolean, optional (default=False)
+            Determines whether the layer is being used at training time. The mean and variance
+            will be computed for the batch during training, while averaged batch statistics will
+            be used at test time.
+        """
+        if test:
+            # use the averaged batch statistics from training rather than computing them on a test batch
             keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape))
-            N = x.size / x.shape[1]
+            x = x - self.moving_mean.reshape(keepdims_shape)
+            x /= np.sqrt(self.moving_variance.reshape(keepdims_shape) + 1e-08)
+            return self.gamma * x + self.beta

-            # all sums carried over non-channel dims
-            # (1/sqrt(var + eps)) * [dL - dL.mean() - (1/N)*x_norm*(x_norm @ dL)]
-            grad_ = grad - np.mean(grad, axis=normed_dims, keepdims=True)
+        x_norm = batchnorm_op(x, gamma=self.gamma, beta=self.beta, eps=1e-08)

-            rterm = self.x_norm * np.reshape(
-                np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1]),
-                keepdims_shape,
-            )
-            rterm /= N
-            grad_ -= rterm
-            grad_ /= self._std
-            if (
-                self.gamma is not None
-            ): # backprop through optional affine transformation
-                gamma = self.gamma.data
-                grad_ *= gamma.reshape(keepdims_shape)
-            return grad_
+        batch_mean = x_norm.creator.mean
+        batch_variance = x_norm.creator.var

-        elif index == 1 and self.gamma is not None: # backprop through gamma
-            return np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1])
-
-        elif (index == 1 and self.gamma is None) or index == 2:
-            normed_dims = tuple(i for i in range(x.ndim) if i != 1)
-            return grad.sum(axis=normed_dims)
-        else: # pragma: no cover
-            raise IndexError
-
-
-def batchnorm(
-    x: ArrayLike,
-    *,
-    gamma: Optional[ArrayLike] = None,
-    beta: Optional[ArrayLike] = None,
-    eps: float,
-    constant: Optional[bool] = None
-) -> Tensor:
-    """
-    Performs batch normalization on ``x``::
-
-        y(x) = (x - E[x]) / sqrt(Var[x] + eps)
-        batchnorm(x) = gamma * y(x) + beta
-
-    Where :math:`E[x]` and :math:`Var[x]` represent the mean and variance, respectively,
-    over axis-1 of ``x``. The subsequent affine transformation on ``y``
-    is optional.
-
-    Parameters
-    ----------
-    x : array_like, shape=(N, C, ...)
-        The batch to be normalized within each entry of C
-
-    gamma : Optional[array_like], shape=(C,)
-        Optional per-channel scaling factors to be applied after the
-        normalization step.
-
-    beta : Optional[array_like], shape=(C,)
-        Optional per-channel scaling bias factors to be applied after the
-        normalization step.
-
-    eps : Real
-        A small non-negative number.
-
-    constant : bool, optional (default=False)
-        If True, the resulting Tensor is a constant.
-
-    Returns
-    -------
-    mygrad.Tensor
-        The batch-normalized data.
-
-    Examples
-    --------
-    >>> import mygrad as mg
-    >>> from mygrad.nnet import batchnorm
-    >>> x = mg.Tensor([1., 4., 1.]).reshape(3, 1)
-    >>> batchnorm(x, eps=0)
-    Tensor([[-0.70710678],
-            [ 1.41421356],
-            [-0.70710678]])
-    """
-    # pass gamma and beta as empty arrays if they are not supplied
-    if gamma is None:
-        gamma = np.array([])
-    if beta is None:
-        beta = np.array([])
-    return Tensor._op(
-        BatchNorm, x, gamma, beta, op_kwargs=dict(eps=eps), constant=constant
-    )
+        self.moving_mean *= 1 - self.momentum
+        self.moving_mean += self.momentum * batch_mean
+        self.moving_variance *= 1 - self.momentum
+        self.moving_variance += self.momentum * batch_variance
+        return x_norm
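For reviewers, a usage sketch of the new layer class defined above. Only the `BatchNorm(input_channels, momentum)` constructor and the `__call__(x, test=...)` signature come from this diff; the batch shape and the toy loss are illustrative assumptions:

# Illustrative sketch of the layer API introduced in this diff (shapes and loss are assumptions).
import numpy as np
import mygrad as mg
from mygrad.nnet.layers.batchnorm import BatchNorm

bn = BatchNorm(input_channels=4, momentum=0.1)

x_train = mg.Tensor(np.random.rand(32, 4))  # assumed shape-(N, C) training batch
out = bn(x_train)        # training mode: normalizes with batch statistics and
                         # updates bn.moving_mean / bn.moving_variance
loss = out.sum()
loss.backward()          # gradients can then flow back to bn.gamma and bn.beta

x_test = mg.Tensor(np.random.rand(8, 4))
out_test = bn(x_test, test=True)  # inference mode: uses the stored moving statistics

Note that the moving statistics are exponential moving averages: each training call scales the stored value by `1 - momentum` and adds `momentum` times the current batch statistic.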
13 changes: 13 additions & 0 deletions src/mygrad/nnet/layers/operations/__init__.py
@@ -0,0 +1,13 @@
+from .batchnorm import batchnorm
+from .conv import conv_nd
+from .pooling import max_pool
+
+__all__ = ["conv_nd", "max_pool", "batchnorm"]
+
+
+try:
+    from .gru import gru
+
+    __all__ += ["gru"]
+except ImportError:  # pragma: no cover
+    pass
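Taken together with the deletion of src/mygrad/nnet/layers/__init__.py above, the functional operations now live in a `mygrad.nnet.layers.operations` subpackage, leaving `mygrad.nnet.layers` for layer classes such as `BatchNorm`. A sketch of the resulting import paths; whether `mygrad.nnet` keeps re-exporting these names is not shown in this diff:

# New import locations after this refactor (per the diffs above).
from mygrad.nnet.layers.operations import batchnorm, conv_nd, max_pool
from mygrad.nnet.layers.batchnorm import BatchNorm

# `gru` is exposed only when its optional dependencies import cleanly,
# mirroring the try/except in operations/__init__.py.
try:
    from mygrad.nnet.layers.operations import gru
except ImportError:
    gru = None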