diff --git a/src/mygrad/__init__.py b/src/mygrad/__init__.py index fec278c2..dcda5549 100644 --- a/src/mygrad/__init__.py +++ b/src/mygrad/__init__.py @@ -24,7 +24,7 @@ from mygrad.math.sequential.funcs import * from mygrad.math.sequential.funcs import max, min from mygrad.math.trigonometric.funcs import * -from mygrad.nnet.layers.utils import sliding_window_view +from mygrad.nnet.layers.operations.utils import sliding_window_view from mygrad.no_grad_funcs import * from mygrad.tensor_creation.funcs import * from mygrad.tensor_manip.array_shape.funcs import * diff --git a/src/mygrad/nnet/layers/__init__.py b/src/mygrad/nnet/layers/__init__.py index c8e0b485..e69de29b 100644 --- a/src/mygrad/nnet/layers/__init__.py +++ b/src/mygrad/nnet/layers/__init__.py @@ -1,13 +0,0 @@ -from .batchnorm import batchnorm -from .conv import conv_nd -from .pooling import max_pool - -__all__ = ["conv_nd", "max_pool", "batchnorm"] - - -try: - from .gru import gru - - __all__ += ["gru"] -except ImportError: # pragma: no cover - pass diff --git a/src/mygrad/nnet/layers/batchnorm.py b/src/mygrad/nnet/layers/batchnorm.py index f98d35bf..08044dc3 100644 --- a/src/mygrad/nnet/layers/batchnorm.py +++ b/src/mygrad/nnet/layers/batchnorm.py @@ -1,173 +1,60 @@ -from typing import Optional, Tuple, Union - import numpy as np from mygrad import Tensor -from mygrad.operation_base import Operation -from mygrad.typing import ArrayLike - -__all__ = ["batchnorm"] +from mygrad.tensor_creation.funcs import ones, zeros +from mygrad.nnet.layers.operations.batchnorm import batchnorm as batchnorm_op -# TODO: Remove affine parameters from Operation -class BatchNorm(Operation): - """ - Attributes - ---------- - mean : numpy.ndarray - var : numpy.ndarray +class BatchNorm: + """ A batch normalization layer. - Notes - ----- - `mean` and `var` are bound as instance-attributes upon - calling the batch-norm instance. + This class will perform an n-dimensional batch normalization operation on an + (N, D, ...)-shaped tensor scaled by γ of shape (D, ...) and shifted by β of shape (D, ...). """ - def __call__(self, x, gamma, beta, *, eps): - """ - y(x) = (x - E[x]) / sqrt(Var[x} + eps) - batchnorm(x) = gamma * y(x) + beta + def __init__(self, input_channels: int, momentum: float = 0.1): + """ Initialize a batch normalization layer. Parameters ---------- - x : mygrad.Tensor - gamma : Optional[mygrad.Tensor] - beta : Optional[mygrad.Tensor] - eps : Real - A small non-negative number. - - Returns - ------- - numpy.ndarray + input_channels : int + The number of channels of the data to be batch-normalized. + momentum : float, optional (default=0.1) + The momentum value used to maintain moving averages. """ - normed_dims = tuple(i for i in range(x.ndim) if i != 1) - keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape)) - - self.variables = tuple(i for i in (x, gamma, beta)) - - if gamma.size == 0: - gamma = None - if beta.size == 0: - beta = None + self.gamma = ones((1, input_channels), dtype=np.float32) + self.beta = zeros((1, input_channels), dtype=np.float32) + self.moving_mean = np.zeros((1, input_channels), dtype=np.float32) + self.moving_variance = np.zeros((1, input_channels), dtype=np.float32) + self.momentum = momentum + self.input_channels = input_channels - self.gamma = gamma - self.beta = beta + def __call__(self, x: Tensor, test: bool = False) -> Tensor: + """ Perform the forward-pass of n-dimensional batch normalization over axis 1 on `x`. 
- x = x.data - self.x_norm = None # required for backprop through gamma - self.mean = x.mean(axis=normed_dims) - self.var = x.var(axis=normed_dims) - - if eps: - self.var += eps - - y = x - self.mean.reshape(keepdims_shape) - self._std = np.sqrt(self.var).reshape(keepdims_shape) # sqrt(var + eps) - y /= self._std - self.x_norm = y - # optional affine transformation - if gamma is not None: - gamma = gamma.data - # must copy `y` to prevent mutation of `self.x_norm` - y = y * gamma.reshape(keepdims_shape) - - if beta is not None: - beta = beta.data - y = y + beta.reshape(keepdims_shape) - return y - - def backward_var(self, grad, index, **kwargs): - x = self.variables[0].data - if index == 0: # backprop through x - normed_dims = tuple(i for i in range(x.ndim) if i != 1) + Parameters + ---------- + x : Union[numpy.ndarray, mygrad.Tensor], shape=(N, D, ...) + The data to normalize. + test : boolean, optional (default=False) + Determines whether the layer is being used at training time. The mean and variance + will be computed for the batch during training, while averaged batch statistics will + be used at test time. + """ + if test: + # use the averaged batch statistics from training rather than computing them on a test batch keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape)) - N = x.size / x.shape[1] + x = x - self.moving_mean.reshape(keepdims_shape) + x /= np.sqrt(self.moving_variance.reshape(keepdims_shape) + 1e-08) + return self.gamma * x + self.beta - # all sums carried over non-channel dims - # (1/sqrt(var + eps)) * [dL - dL.mean() - (1/N)*x_norm*(x_norm @ dL)] - grad_ = grad - np.mean(grad, axis=normed_dims, keepdims=True) + x_norm = batchnorm_op(x, gamma=self.gamma, beta=self.beta, eps=1e-08) - rterm = self.x_norm * np.reshape( - np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1]), - keepdims_shape, - ) - rterm /= N - grad_ -= rterm - grad_ /= self._std - if ( - self.gamma is not None - ): # backprop through optional affine transformation - gamma = self.gamma.data - grad_ *= gamma.reshape(keepdims_shape) - return grad_ + batch_mean = x_norm.creator.mean + batch_variance = x_norm.creator.var - elif index == 1 and self.gamma is not None: # backprop through gamma - return np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1]) - - elif (index == 1 and self.gamma is None) or index == 2: - normed_dims = tuple(i for i in range(x.ndim) if i != 1) - return grad.sum(axis=normed_dims) - else: # pragma: no cover - raise IndexError - - -def batchnorm( - x: ArrayLike, - *, - gamma: Optional[ArrayLike] = None, - beta: Optional[ArrayLike] = None, - eps: float, - constant: Optional[bool] = None -) -> Tensor: - """ - Performs batch normalization on ``x``:: - - y(x) = (x - E[x]) / sqrt(Var[x] + eps) - batchnorm(x) = gamma * y(x) + beta - - Where :math:`E[x]` and :math:`Var[x]` represent the mean and variance, respectively, - over axis-1 of ``x``. The subsequent affine transformation on ``y`` - is optional. - - Parameters - ---------- - x : array_like, shape=(N, C, ...) - The batch to be normalized within each entry of C - - gamma : Optional[array_like], shape=(C,) - Optional per-channel scaling factors to be applied after the - normalization step. - - beta : Optional[array_like], shape=(C,) - Optional per-channel scaling bias factors to be applied after the - normalization step. - - eps : Real - A small non-negative number. - - constant : bool, optional (default=False) - If True, the resulting Tensor is a constant. 
- - Returns - ------- - mygrad.Tensor - The batch-normalized data. - - Examples - -------- - >>> import mygrad as mg - >>> from mygrad.nnet import batchnorm - >>> x = mg.Tensor([1., 4., 1.]).reshape(3, 1) - >>> batchnorm(x, eps=0) - Tensor([[-0.70710678], - [ 1.41421356], - [-0.70710678]]) - """ - # pass gamma and beta as empty arrays if they are not supplied - if gamma is None: - gamma = np.array([]) - if beta is None: - beta = np.array([]) - return Tensor._op( - BatchNorm, x, gamma, beta, op_kwargs=dict(eps=eps), constant=constant - ) + self.moving_mean *= 1 - self.momentum + self.moving_mean += self.momentum * batch_mean + self.moving_variance *= 1 - self.momentum + self.moving_variance += self.momentum * batch_variance + return x_norm diff --git a/src/mygrad/nnet/layers/operations/__init__.py b/src/mygrad/nnet/layers/operations/__init__.py new file mode 100644 index 00000000..c8e0b485 --- /dev/null +++ b/src/mygrad/nnet/layers/operations/__init__.py @@ -0,0 +1,13 @@ +from .batchnorm import batchnorm +from .conv import conv_nd +from .pooling import max_pool + +__all__ = ["conv_nd", "max_pool", "batchnorm"] + + +try: + from .gru import gru + + __all__ += ["gru"] +except ImportError: # pragma: no cover + pass diff --git a/src/mygrad/nnet/layers/operations/batchnorm.py b/src/mygrad/nnet/layers/operations/batchnorm.py new file mode 100644 index 00000000..f98d35bf --- /dev/null +++ b/src/mygrad/nnet/layers/operations/batchnorm.py @@ -0,0 +1,173 @@ +from typing import Optional, Tuple, Union + +import numpy as np + +from mygrad import Tensor +from mygrad.operation_base import Operation +from mygrad.typing import ArrayLike + +__all__ = ["batchnorm"] + + +# TODO: Remove affine parameters from Operation +class BatchNorm(Operation): + """ + Attributes + ---------- + mean : numpy.ndarray + var : numpy.ndarray + + Notes + ----- + `mean` and `var` are bound as instance-attributes upon + calling the batch-norm instance. + """ + + def __call__(self, x, gamma, beta, *, eps): + """ + y(x) = (x - E[x]) / sqrt(Var[x} + eps) + batchnorm(x) = gamma * y(x) + beta + + Parameters + ---------- + x : mygrad.Tensor + gamma : Optional[mygrad.Tensor] + beta : Optional[mygrad.Tensor] + eps : Real + A small non-negative number. 
+ + Returns + ------- + numpy.ndarray + """ + normed_dims = tuple(i for i in range(x.ndim) if i != 1) + keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape)) + + self.variables = tuple(i for i in (x, gamma, beta)) + + if gamma.size == 0: + gamma = None + if beta.size == 0: + beta = None + + self.gamma = gamma + self.beta = beta + + x = x.data + self.x_norm = None # required for backprop through gamma + self.mean = x.mean(axis=normed_dims) + self.var = x.var(axis=normed_dims) + + if eps: + self.var += eps + + y = x - self.mean.reshape(keepdims_shape) + self._std = np.sqrt(self.var).reshape(keepdims_shape) # sqrt(var + eps) + y /= self._std + self.x_norm = y + # optional affine transformation + if gamma is not None: + gamma = gamma.data + # must copy `y` to prevent mutation of `self.x_norm` + y = y * gamma.reshape(keepdims_shape) + + if beta is not None: + beta = beta.data + y = y + beta.reshape(keepdims_shape) + return y + + def backward_var(self, grad, index, **kwargs): + x = self.variables[0].data + if index == 0: # backprop through x + normed_dims = tuple(i for i in range(x.ndim) if i != 1) + keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape)) + N = x.size / x.shape[1] + + # all sums carried over non-channel dims + # (1/sqrt(var + eps)) * [dL - dL.mean() - (1/N)*x_norm*(x_norm @ dL)] + grad_ = grad - np.mean(grad, axis=normed_dims, keepdims=True) + + rterm = self.x_norm * np.reshape( + np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1]), + keepdims_shape, + ) + rterm /= N + grad_ -= rterm + grad_ /= self._std + if ( + self.gamma is not None + ): # backprop through optional affine transformation + gamma = self.gamma.data + grad_ *= gamma.reshape(keepdims_shape) + return grad_ + + elif index == 1 and self.gamma is not None: # backprop through gamma + return np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1]) + + elif (index == 1 and self.gamma is None) or index == 2: + normed_dims = tuple(i for i in range(x.ndim) if i != 1) + return grad.sum(axis=normed_dims) + else: # pragma: no cover + raise IndexError + + +def batchnorm( + x: ArrayLike, + *, + gamma: Optional[ArrayLike] = None, + beta: Optional[ArrayLike] = None, + eps: float, + constant: Optional[bool] = None +) -> Tensor: + """ + Performs batch normalization on ``x``:: + + y(x) = (x - E[x]) / sqrt(Var[x] + eps) + batchnorm(x) = gamma * y(x) + beta + + Where :math:`E[x]` and :math:`Var[x]` represent the mean and variance, respectively, + over axis-1 of ``x``. The subsequent affine transformation on ``y`` + is optional. + + Parameters + ---------- + x : array_like, shape=(N, C, ...) + The batch to be normalized within each entry of C + + gamma : Optional[array_like], shape=(C,) + Optional per-channel scaling factors to be applied after the + normalization step. + + beta : Optional[array_like], shape=(C,) + Optional per-channel scaling bias factors to be applied after the + normalization step. + + eps : Real + A small non-negative number. + + constant : bool, optional (default=False) + If True, the resulting Tensor is a constant. + + Returns + ------- + mygrad.Tensor + The batch-normalized data. 
+ + Examples + -------- + >>> import mygrad as mg + >>> from mygrad.nnet import batchnorm + >>> x = mg.Tensor([1., 4., 1.]).reshape(3, 1) + >>> batchnorm(x, eps=0) + Tensor([[-0.70710678], + [ 1.41421356], + [-0.70710678]]) + """ + # pass gamma and beta as empty arrays if they are not supplied + if gamma is None: + gamma = np.array([]) + if beta is None: + beta = np.array([]) + return Tensor._op( + BatchNorm, x, gamma, beta, op_kwargs=dict(eps=eps), constant=constant + ) diff --git a/src/mygrad/nnet/layers/conv.py b/src/mygrad/nnet/layers/operations/conv.py similarity index 99% rename from src/mygrad/nnet/layers/conv.py rename to src/mygrad/nnet/layers/operations/conv.py index aa668a9f..c83bf8b4 100644 --- a/src/mygrad/nnet/layers/conv.py +++ b/src/mygrad/nnet/layers/operations/conv.py @@ -3,7 +3,7 @@ import numpy as np -from mygrad.nnet.layers.utils import sliding_window_view +from mygrad.nnet.layers.operations.utils import sliding_window_view from mygrad.operation_base import Operation from mygrad.tensor_base import Tensor from mygrad.typing import ArrayLike diff --git a/src/mygrad/nnet/layers/gru.py b/src/mygrad/nnet/layers/operations/gru.py similarity index 100% rename from src/mygrad/nnet/layers/gru.py rename to src/mygrad/nnet/layers/operations/gru.py diff --git a/src/mygrad/nnet/layers/pooling.py b/src/mygrad/nnet/layers/operations/pooling.py similarity index 99% rename from src/mygrad/nnet/layers/pooling.py rename to src/mygrad/nnet/layers/operations/pooling.py index 439f9df3..3d1f07ab 100644 --- a/src/mygrad/nnet/layers/pooling.py +++ b/src/mygrad/nnet/layers/operations/pooling.py @@ -3,7 +3,7 @@ import numpy as np -from mygrad.nnet.layers.utils import sliding_window_view +from mygrad.nnet.layers.operations.utils import sliding_window_view from mygrad.operation_base import Operation from mygrad.tensor_base import Tensor from mygrad.typing import ArrayLike diff --git a/src/mygrad/nnet/layers/utils.py b/src/mygrad/nnet/layers/operations/utils.py similarity index 100% rename from src/mygrad/nnet/layers/utils.py rename to src/mygrad/nnet/layers/operations/utils.py diff --git a/src/mygrad/tensor_manip/tiling/ops.py b/src/mygrad/tensor_manip/tiling/ops.py index 3b27d91a..82ce749b 100644 --- a/src/mygrad/tensor_manip/tiling/ops.py +++ b/src/mygrad/tensor_manip/tiling/ops.py @@ -2,7 +2,7 @@ import numpy as np -from mygrad.nnet.layers.utils import sliding_window_view +from mygrad.nnet.layers.operations.utils import sliding_window_view from mygrad.operation_base import Operation from mygrad.tensor_base import Tensor diff --git a/tests/nnet/initializers/test_dirac.py b/tests/nnet/initializers/test_dirac.py index c8ce8df7..270ee960 100644 --- a/tests/nnet/initializers/test_dirac.py +++ b/tests/nnet/initializers/test_dirac.py @@ -6,7 +6,7 @@ from mygrad import Tensor from mygrad.nnet.initializers import dirac -from mygrad.nnet.layers.conv import conv_nd +from mygrad.nnet.layers.operations.conv import conv_nd @given(shape=hnp.array_shapes(max_dims=1)) diff --git a/tests/nnet/layers/test_batchnorm.py b/tests/nnet/layers/test_batchnorm.py index 5953acaf..d7987912 100644 --- a/tests/nnet/layers/test_batchnorm.py +++ b/tests/nnet/layers/test_batchnorm.py @@ -6,7 +6,7 @@ import mygrad as mg from mygrad import Tensor -from mygrad.nnet.layers.batchnorm import batchnorm +from mygrad.nnet.layers.operations.batchnorm import batchnorm from tests.wrappers.uber import backprop_test_factory, fwdprop_test_factory diff --git a/tests/nnet/layers/test_conv.py b/tests/nnet/layers/test_conv.py index 
2b1d01b2..8173e9cf 100644 --- a/tests/nnet/layers/test_conv.py +++ b/tests/nnet/layers/test_conv.py @@ -11,7 +11,7 @@ import mygrad as mg from mygrad import Tensor -from mygrad.nnet.layers import conv_nd +from mygrad.nnet.layers.operations import conv_nd from ...utils.numerical_gradient import numerical_gradient_full from ...wrappers.uber import backprop_test_factory, fwdprop_test_factory diff --git a/tests/nnet/layers/test_gru.py b/tests/nnet/layers/test_gru.py index e34fb555..c2be41f2 100644 --- a/tests/nnet/layers/test_gru.py +++ b/tests/nnet/layers/test_gru.py @@ -13,7 +13,7 @@ from mygrad.tensor_base import Tensor from tests.utils.errors import does_not_raise -gru_module = pytest.importorskip("mygrad.nnet.layers.gru") +gru_module = pytest.importorskip("mygrad.nnet.layers.operations.gru") gru = gru_module.gru diff --git a/tests/nnet/layers/test_maxpool.py b/tests/nnet/layers/test_maxpool.py index b106f601..2df7c7d5 100644 --- a/tests/nnet/layers/test_maxpool.py +++ b/tests/nnet/layers/test_maxpool.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose from pytest import raises -from mygrad.nnet.layers import max_pool +from mygrad.nnet.layers.operations import max_pool from mygrad.tensor_base import Tensor diff --git a/tests/nnet/test_sliding_window.py b/tests/nnet/test_sliding_window.py index 219b298b..5928da37 100644 --- a/tests/nnet/test_sliding_window.py +++ b/tests/nnet/test_sliding_window.py @@ -5,7 +5,7 @@ from hypothesis import given, settings from numpy.testing import assert_allclose -from mygrad.nnet.layers.utils import sliding_window_view +from mygrad.nnet.layers.operations.utils import sliding_window_view dtype_strat_numpy = st.sampled_from( (np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64)
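
A minimal usage sketch of the new class-based `BatchNorm` layer added in src/mygrad/nnet/layers/batchnorm.py, which wraps the relocated functional op and maintains moving statistics via its momentum term. The class name, its `__call__(x, test=...)` signature, and the `mygrad.nnet.layers.operations` import path come from the diff above; the variable names, channel count, and batch shapes are arbitrary illustrations, not part of the change.

    import numpy as np
    import mygrad as mg
    from mygrad.nnet.layers.operations import batchnorm  # functional op, relocated
    from mygrad.nnet.layers.batchnorm import BatchNorm   # new stateful layer

    bn = BatchNorm(input_channels=10, momentum=0.1)

    x = mg.Tensor(np.random.rand(32, 10))  # shape-(N, D) training batch
    out = bn(x)      # training-time call: normalizes with batch statistics and
                     # updates bn.moving_mean / bn.moving_variance
    out.backward()   # backpropagate through the normalization

    x_test = np.random.rand(8, 10)
    test_out = bn(x_test, test=True)  # test-time call: uses the stored moving averages

    # the functional form remains available at its new path
    y = batchnorm(mg.Tensor(np.random.rand(4, 10)), eps=1e-8)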