From 039638d9364fd3a01eee5c2017612db4d1a39425 Mon Sep 17 00:00:00 2001
From: Milan Curcic
Date: Fri, 21 Feb 2025 12:38:53 -0500
Subject: [PATCH] Dropout layer (#194)

* First stab at dropout; conflict with base type TODO
* Partial dropout integration
* Test uninitialized dropout layer
* Test dropout state that follows an input layer
* Enable forward pass for dropout; backward pass TODO
* Version bump and add dropout to the features table
* Add dropout to CMake
* Enable preprocessing in fpm.toml (needed with recent versions of fpm)
* Small change in scale implementation
* Integration of backward pass for dropout
* Reduce tolerance in conv2d convergence tests
* Fix bug in dropout scaling

Co-authored-by: Ricardo Orsi <@ricor07>

* disable dropout in inference mode (net % predict); TODO enable in net % train
* Set dropout's training mode to true in net % train(); add tests
* WIP dropout tests
* Dropout layers always in training mode, except when `predict` is called, when they are in inference mode
* Update the layers table
* Ensure the actual dropout rate == requested dropout rate in most cases
* Accumulate the gradient in dropout % backward and flush in network % update
* Guard against bad dropout rate
* Connect the backward pass; expand tests
* Expand tests
* Use the reference scaling in dropout; don't accumulate gradients because it's not needed
* Add dropout to MNIST example; small model changes
* Add reference
* Update print_info dropout
* Update print_info
* Compute scale once in dropout constructor
* dropout % backward() doesn't need input from the previous layer
* Timing info of dropout

---------

Co-authored-by: Vandenplas, Jeremie
---
 CMakeLists.txt                             |   2 +
 README.md                                  |   5 +-
 example/dense_mnist.f90                    |   9 +-
 src/nf.f90                                 |   2 +-
 src/nf/nf_dropout_layer.f90                |  83 +++++++
 src/nf/nf_dropout_layer_submodule.f90      |  68 ++++++
 src/nf/nf_layer.f90                        |   2 +-
 src/nf/nf_layer_constructors.f90           |  20 +-
 src/nf/nf_layer_constructors_submodule.f90 |  12 +-
 src/nf/nf_layer_submodule.f90              |  53 ++++-
 src/nf/nf_network.f90                      |  10 +
 src/nf/nf_network_submodule.f90            |  76 ++++++-
 src/nf/nf_random.f90                       |  24 +-
 test/CMakeLists.txt                        |   1 +
 test/test_conv2d_network.f90               |   6 +-
 test/test_dropout_layer.f90                | 243 +++++++++++++++++++++
 16 files changed, 582 insertions(+), 34 deletions(-)
 create mode 100644 src/nf/nf_dropout_layer.f90
 create mode 100644 src/nf/nf_dropout_layer_submodule.f90
 create mode 100644 test/test_dropout_layer.f90

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc2ddfcb..eda96b28 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,8 @@ add_library(neural-fortran
   src/nf/nf_reshape_layer_submodule.f90
   src/nf/io/nf_io_binary.f90
   src/nf/io/nf_io_binary_submodule.f90
+  src/nf/nf_dropout_layer.f90
+  src/nf/nf_dropout_layer_submodule.f90
 )
 
 target_link_libraries(neural-fortran PRIVATE)
diff --git a/README.md b/README.md
index ebf7704d..a0eee745 100644
--- a/README.md
+++ b/README.md
@@ -30,11 +30,12 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
| Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass | |------------|------------------|------------------------|----------------------|--------------|---------------| | Input | `input` | n/a | 1, 2, 3 | n/a | n/a | -| Dense (fully-connected) | `dense` | `input1d`, `flatten` | 1 | ✅ | ✅ | +| Dense (fully-connected) | `dense` | `input1d`, `dense`, `dropout`, `flatten` | 1 | ✅ | ✅ | +| Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ | | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) | | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ | -| Linear (2-d) | `linear2d` | `input2d` | 2 | ✅ | ✅ | +| Linear (2-d) | `linear2d` | `input2d`, `linear2d` | 2 | ✅ | ✅ | | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ | (*) See Issue [#145](https://github.com/modern-fortran/neural-fortran/issues/145) regarding non-converging CNN training on the MNIST dataset. diff --git a/example/dense_mnist.f90 b/example/dense_mnist.f90 index c26d0ced..c1db2da4 100644 --- a/example/dense_mnist.f90 +++ b/example/dense_mnist.f90 @@ -1,6 +1,6 @@ program dense_mnist - use nf, only: dense, input, network, sgd, label_digits, load_mnist, corr + use nf, only: dense, input, network, sgd, label_digits, load_mnist, corr, relu, softmax, dropout implicit none @@ -17,8 +17,9 @@ program dense_mnist net = network([ & input(784), & - dense(30), & - dense(10) & + dense(64, relu()), & + dropout(0.2), & + dense(10, softmax()) & ]) num_epochs = 10 @@ -32,7 +33,7 @@ program dense_mnist call net % train( & training_images, & label_digits(training_labels), & - batch_size=100, & + batch_size=128, & epochs=1, & optimizer=sgd(learning_rate=3.) & ) diff --git a/src/nf.f90 b/src/nf.f90 index e9b027c1..7a989ea3 100644 --- a/src/nf.f90 +++ b/src/nf.f90 @@ -3,7 +3,7 @@ module nf use nf_datasets_mnist, only: label_digits, load_mnist use nf_layer, only: layer use nf_layer_constructors, only: & - conv2d, dense, flatten, input, maxpool2d, reshape, linear2d + conv2d, dense, dropout, flatten, input, linear2d, maxpool2d, reshape use nf_loss, only: mse, quadratic use nf_metrics, only: corr, maxabs use nf_network, only: network diff --git a/src/nf/nf_dropout_layer.f90 b/src/nf/nf_dropout_layer.f90 new file mode 100644 index 00000000..f7165aa0 --- /dev/null +++ b/src/nf/nf_dropout_layer.f90 @@ -0,0 +1,83 @@ +module nf_dropout_layer + + !! Dropout layer by Srivastava et al. (2014). + !! + !! Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I. and + !! Salakhutdinov, R., 2014. Dropout: a simple way to prevent neural networks + !! from overfitting. The Journal of Machine Learning Research, 16(1), + !! pp.1929-1958. + + use nf_base_layer, only: base_layer + + implicit none + + private + public :: dropout_layer + + type, extends(base_layer) :: dropout_layer + !! Concrete implementation of a dropout layer type + + integer :: input_size = 0 + + real, allocatable :: output(:) + real, allocatable :: gradient(:) + real, allocatable :: mask(:) ! binary mask for dropout + + real :: dropout_rate ! probability of dropping a neuron + real :: scale ! scale factor to preserve the input sum + logical :: training = .true. ! set to .false. 
for inference
+
+  contains
+
+    procedure :: backward
+    procedure :: forward
+    procedure :: init
+
+  end type dropout_layer
+
+  interface dropout_layer
+    module function dropout_layer_cons(rate) &
+        result(res)
+      !! This function returns the `dropout_layer` instance.
+      real, intent(in) :: rate
+        !! Dropout rate
+      type(dropout_layer) :: res
+        !! dropout_layer instance
+    end function dropout_layer_cons
+  end interface dropout_layer
+
+  interface
+
+    pure module subroutine backward(self, gradient)
+      !! Apply the backward pass to compute the layer gradient.
+      !! A dropout layer has no weights or biases to update; the incoming
+      !! gradient is masked and scaled the same way as the forward output.
+      class(dropout_layer), intent(in out) :: self
+        !! Dropout layer instance
+      real, intent(in) :: gradient(:)
+        !! Gradient from the next layer
+    end subroutine backward
+
+    module subroutine forward(self, input)
+      !! Propagate forward the layer.
+      !! Calling this subroutine updates the values of a few data components
+      !! of `dropout_layer` that are needed for the backward pass.
+      class(dropout_layer), intent(in out) :: self
+        !! Dropout layer instance
+      real, intent(in) :: input(:)
+        !! Input from the previous layer
+    end subroutine forward
+
+    module subroutine init(self, input_shape)
+      !! Initialize the layer data structures.
+      !!
+      !! This is a deferred procedure from the `base_layer` abstract type.
+      class(dropout_layer), intent(in out) :: self
+        !! Dropout layer instance
+      integer, intent(in) :: input_shape(:)
+        !! Shape of the input layer
+    end subroutine init
+
+  end interface
+
+end module nf_dropout_layer
diff --git a/src/nf/nf_dropout_layer_submodule.f90 b/src/nf/nf_dropout_layer_submodule.f90
new file mode 100644
index 00000000..3fe07b1a
--- /dev/null
+++ b/src/nf/nf_dropout_layer_submodule.f90
@@ -0,0 +1,68 @@
+submodule (nf_dropout_layer) nf_dropout_layer_submodule
+  use nf_random, only: shuffle
+  !! This submodule implements the procedures defined in the
+  !! nf_dropout_layer module.
+
+contains
+
+  module function dropout_layer_cons(rate) result(res)
+    real, intent(in) :: rate
+    type(dropout_layer) :: res
+    res % dropout_rate = rate
+    res % scale = 1 / (1 - rate)
+  end function dropout_layer_cons
+
+
+  module subroutine init(self, input_shape)
+    class(dropout_layer), intent(in out) :: self
+    integer, intent(in) :: input_shape(:)
+
+    self % input_size = input_shape(1)
+
+    ! Allocate arrays
+    allocate(self % output(self % input_size))
+    allocate(self % gradient(self % input_size))
+    allocate(self % mask(self % input_size))
+
+    ! Initialize arrays
+    self % output = 0
+    self % gradient = 0
+    self % mask = 1 ! Default mask is all ones (no dropout)
+
+  end subroutine init
+
+
+  module subroutine forward(self, input)
+    class(dropout_layer), intent(in out) :: self
+    real, intent(in) :: input(:)
+
+    ! Generate random mask for dropout, training mode only
+    if (self % training) then
+
+      ! Set the first size(input) * dropout_rate elements of the mask to 0,
+      ! the rest to 1, and shuffle. Note that the number of zeroed elements
+      ! rounds down to the nearest integer, so when size(input) * dropout_rate
+      ! is not an integer, the actual dropout rate will be slightly lower.
+      self % mask = 1
+      self % mask(:int(size(self % mask) * self % dropout_rate)) = 0
+      call shuffle(self % mask)
+
+      ! Apply dropout mask
+      self % output = input * self % mask * self % scale
+
+    else
+      ! 
In inference mode, we don't apply dropout; simply pass through the input + self % output = input + + end if + + end subroutine forward + + + pure module subroutine backward(self, gradient) + class(dropout_layer), intent(in out) :: self + real, intent(in) :: gradient(:) + self % gradient = gradient * self % mask * self % scale + end subroutine backward + +end submodule nf_dropout_layer_submodule \ No newline at end of file diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index 33d1c773..517622b0 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -91,7 +91,7 @@ end subroutine backward_3d interface - pure module subroutine forward(self, input) + module subroutine forward(self, input) !! Apply a forward pass on the layer. !! This changes the internal state of the layer. !! This is normally called internally by the `network % forward` diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90 index 2983ddcd..87ceeeea 100644 --- a/src/nf/nf_layer_constructors.f90 +++ b/src/nf/nf_layer_constructors.f90 @@ -8,7 +8,7 @@ module nf_layer_constructors implicit none private - public :: conv2d, dense, flatten, input, maxpool2d, reshape, linear2d + public :: conv2d, dense, dropout, flatten, input, linear2d, maxpool2d, reshape interface input @@ -104,6 +104,24 @@ module function dense(layer_size, activation) result(res) !! Resulting layer instance end function dense + module function dropout(rate) result(res) + !! Create a dropout layer with a given dropout rate. + !! + !! This layer is for randomly disabling neurons during training. + !! + !! Example: + !! + !! ``` + !! use nf, only :: dropout, layer + !! type(layer) :: dropout_layer + !! dropout_layer = dropout(rate=0.5) + !! ``` + real, intent(in) :: rate + !! Dropout rate - fraction of neurons to randomly disable during training + type(layer) :: res + !! Resulting layer instance + end function dropout + module function flatten() result(res) !! Flatten (3-d -> 1-d) layer constructor. !! diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90 index ae7d05dc..9558a0bc 100644 --- a/src/nf/nf_layer_constructors_submodule.f90 +++ b/src/nf/nf_layer_constructors_submodule.f90 @@ -3,6 +3,7 @@ use nf_layer, only: layer use nf_conv2d_layer, only: conv2d_layer use nf_dense_layer, only: dense_layer + use nf_dropout_layer, only: dropout_layer use nf_flatten_layer, only: flatten_layer use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer @@ -65,6 +66,16 @@ module function dense(layer_size, activation) result(res) end function dense + module function dropout(rate) result(res) + real, intent(in) :: rate + type(layer) :: res + if (rate < 0 .or. 
rate > 1) & + error stop 'rate must be between 0 and 1 in a dropout layer' + res % name = 'dropout' + allocate(res % p, source=dropout_layer(rate)) + end function dropout + + module function flatten() result(res) type(layer) :: res res % name = 'flatten' @@ -72,7 +83,6 @@ module function flatten() result(res) end function flatten - module function input1d(layer_size) result(res) integer, intent(in) :: layer_size type(layer) :: res diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index 22eabe9e..701dfe29 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -3,6 +3,7 @@ use iso_fortran_env, only: stderr => error_unit use nf_conv2d_layer, only: conv2d_layer use nf_dense_layer, only: dense_layer + use nf_dropout_layer, only: dropout_layer use nf_flatten_layer, only: flatten_layer use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer @@ -26,16 +27,22 @@ pure module subroutine backward_1d(self, previous, gradient) type is(dense_layer) - ! Upstream layers permitted: input1d, dense, flatten + ! Upstream layers permitted: input1d, dense, dropout, flatten select type(prev_layer => previous % p) type is(input1d_layer) call this_layer % backward(prev_layer % output, gradient) type is(dense_layer) call this_layer % backward(prev_layer % output, gradient) + type is(dropout_layer) + call this_layer % backward(prev_layer % output, gradient) type is(flatten_layer) call this_layer % backward(prev_layer % output, gradient) end select + type is(dropout_layer) + ! Upstream layers permitted: input1d, dense, dropout, flatten + call this_layer % backward(gradient) + type is(flatten_layer) ! Upstream layers permitted: input2d, input3d, conv2d, maxpool2d @@ -134,7 +141,7 @@ pure module subroutine backward_3d(self, previous, gradient) end subroutine backward_3d - pure module subroutine forward(self, input) + module subroutine forward(self, input) implicit none class(layer), intent(in out) :: self class(layer), intent(in) :: input @@ -143,6 +150,20 @@ pure module subroutine forward(self, input) type is(dense_layer) + ! Upstream layers permitted: input1d, dense, dropout, flatten + select type(prev_layer => input % p) + type is(input1d_layer) + call this_layer % forward(prev_layer % output) + type is(dense_layer) + call this_layer % forward(prev_layer % output) + type is(dropout_layer) + call this_layer % forward(prev_layer % output) + type is(flatten_layer) + call this_layer % forward(prev_layer % output) + end select + + type is(dropout_layer) + ! Upstream layers permitted: input1d, dense, flatten select type(prev_layer => input % p) type is(input1d_layer) @@ -301,17 +322,19 @@ impure elemental module subroutine init(self, input) call this_layer % init(input % layer_shape) end select - ! The shape of linear2d, conv2d, maxpool2d, or flatten layers is not known - ! until we receive an input layer. + ! The shape of conv2d, dropout, flatten, linear2d, or maxpool2d layers + ! is not known until we receive an input layer. 
select type(this_layer => self % p) type is(conv2d_layer) self % layer_shape = shape(this_layer % output) - type is(maxpool2d_layer) + type is(dropout_layer) self % layer_shape = shape(this_layer % output) type is(flatten_layer) self % layer_shape = shape(this_layer % output) type is(linear2d_layer) self % layer_shape = shape(this_layer % output) + type is(maxpool2d_layer) + self % layer_shape = shape(this_layer % output) end select self % input_layer_shape = input % layer_shape @@ -328,9 +351,14 @@ impure elemental module subroutine print_info(self) if (.not. self % name == 'input') & print '("Input shape: ", *(i0, 1x))', self % input_layer_shape print '("Output shape: ", *(i0, 1x))', self % layer_shape - print '("Parameters: ", i0)', self % get_num_params() - if (.not. self % name == 'input') & + if (.not. self % name == 'dropout') & + print '("Parameters: ", i0)', self % get_num_params() + if (.not. (self % name == 'input' .or. self % name == 'dropout')) & print '("Activation: ", a)', self % activation + select type (this_layer => self % p) + type is (dropout_layer) + print '("Dropout rate: ", f0.2)', this_layer % dropout_rate + end select print * end subroutine print_info @@ -349,6 +377,8 @@ elemental module function get_num_params(self) result(num_params) num_params = 0 type is (dense_layer) num_params = this_layer % get_num_params() + type is (dropout_layer) + num_params = 0 type is (conv2d_layer) num_params = this_layer % get_num_params() type is (maxpool2d_layer) @@ -378,6 +408,8 @@ module function get_params(self) result(params) ! No parameters to get. type is (dense_layer) params = this_layer % get_params() + type is (dropout_layer) + ! No parameters to get. type is (conv2d_layer) params = this_layer % get_params() type is (maxpool2d_layer) @@ -407,6 +439,8 @@ module function get_gradients(self) result(gradients) ! No gradients to get. type is (dense_layer) gradients = this_layer % get_gradients() + type is (dropout_layer) + ! No gradients to get. type is (conv2d_layer) gradients = this_layer % get_gradients() type is (maxpool2d_layer) @@ -461,6 +495,11 @@ module subroutine set_params(self, params) type is (dense_layer) call this_layer % set_params(params) + type is (dropout_layer) + ! No parameters to set. + write(stderr, '(a)') 'Warning: calling set_params() ' & + // 'on a zero-parameter layer; nothing to do.' + type is (conv2d_layer) call this_layer % set_params(params) diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index fa7ea4eb..5916924e 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -26,6 +26,7 @@ module nf_network procedure :: get_params procedure :: print_info procedure :: set_params + procedure :: set_training_mode procedure :: train procedure :: update @@ -223,6 +224,15 @@ module subroutine set_params(self, params) !! Network parameters to set end subroutine set_params + module subroutine set_training_mode(self, training) + !! Set the mode to training (.true.) or inference (.false.). + !! Used internally to enable/disable the dropout layers in the network. + class(network), intent(in out) :: self + !! Network instance + logical, intent(in) :: training + !! .true. for training mode, .false. for inference. + end subroutine set_training_mode + module subroutine print_info(self) !! Prints a brief summary of the network and its layers to the screen. 
class(network), intent(in) :: self diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index c2a9c903..dd632d96 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -2,6 +2,7 @@ use nf_conv2d_layer, only: conv2d_layer use nf_dense_layer, only: dense_layer + use nf_dropout_layer, only: dropout_layer use nf_flatten_layer, only: flatten_layer use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer @@ -141,7 +142,8 @@ module subroutine backward(self, output, loss) select type(next_layer => self % layers(n + 1) % p) type is(dense_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) - + type is(dropout_layer) + call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) type is(conv2d_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) @@ -251,19 +253,27 @@ module function predict_1d(self, input) result(res) class(network), intent(in out) :: self real, intent(in) :: input(:) real, allocatable :: res(:) - integer :: num_layers + integer :: n, num_layers num_layers = size(self % layers) + ! predict is run in inference mode only; + ! set all dropout layers' training mode to false, and + ! return to training mode after inference. + call self % set_training_mode(.false.) call self % forward(input) + call self % set_training_mode(.true.) select type(output_layer => self % layers(num_layers) % p) type is(dense_layer) res = output_layer % output + type is(dropout_layer) + res = output_layer % output type is(flatten_layer) res = output_layer % output class default - error stop 'network % output not implemented for this output layer' + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' end select end function predict_1d @@ -273,11 +283,16 @@ module function predict_2d(self, input) result(res) class(network), intent(in out) :: self real, intent(in) :: input(:,:) real, allocatable :: res(:) - integer :: num_layers + integer :: n, num_layers num_layers = size(self % layers) + ! predict is run in inference mode only; + ! set all dropout layers' training mode to false, and + ! return to training mode after inference. + call self % set_training_mode(.false.) call self % forward(input) + call self % set_training_mode(.true.) select type(output_layer => self % layers(num_layers) % p) type is(dense_layer) @@ -285,7 +300,8 @@ module function predict_2d(self, input) result(res) type is(flatten_layer) res = output_layer % output class default - error stop 'network % output not implemented for this output layer' + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' end select end function predict_2d @@ -295,11 +311,16 @@ module function predict_3d(self, input) result(res) class(network), intent(in out) :: self real, intent(in) :: input(:,:,:) real, allocatable :: res(:) - integer :: num_layers + integer :: n, num_layers num_layers = size(self % layers) + ! predict is run in inference mode only; + ! set all dropout layers' training mode to false, and + ! return to training mode after inference. + call self % set_training_mode(.false.) call self % forward(input) + call self % set_training_mode(.true.) 
select type(output_layer => self % layers(num_layers) % p) type is(conv2d_layer) @@ -310,7 +331,8 @@ module function predict_3d(self, input) result(res) type is(flatten_layer) res = output_layer % output class default - error stop 'network % output not implemented for this output layer' + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' end select end function predict_3d @@ -320,12 +342,17 @@ module function predict_batch_1d(self, input) result(res) class(network), intent(in out) :: self real, intent(in) :: input(:,:) real, allocatable :: res(:,:) - integer :: i, batch_size, num_layers, output_size + integer :: i, n, batch_size, num_layers, output_size num_layers = size(self % layers) batch_size = size(input, dim=rank(input)) output_size = product(self % layers(num_layers) % layer_shape) + ! predict is run in inference mode only; + ! set all dropout layers' training mode to false, and + ! return to training mode after inference. + call self % set_training_mode(.false.) + allocate(res(output_size, batch_size)) batch: do i = 1, size(res, dim=2) @@ -338,11 +365,16 @@ module function predict_batch_1d(self, input) result(res) type is(flatten_layer) res(:,i) = output_layer % output class default - error stop 'network % output not implemented for this output layer' + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' end select end do batch + ! We are now done with inference; + ! return to training mode for dropout layers. + call self % set_training_mode(.true.) + end function predict_batch_1d @@ -350,12 +382,17 @@ module function predict_batch_3d(self, input) result(res) class(network), intent(in out) :: self real, intent(in) :: input(:,:,:,:) real, allocatable :: res(:,:) - integer :: i, batch_size, num_layers, output_size + integer :: i, n, batch_size, num_layers, output_size num_layers = size(self % layers) batch_size = size(input, dim=rank(input)) output_size = product(self % layers(num_layers) % layer_shape) + ! predict is run in inference mode only; + ! set all dropout layers' training mode to false, and + ! return to training mode after inference. + call self % set_training_mode(.false.) + allocate(res(output_size, batch_size)) batch: do i = 1, batch_size @@ -371,11 +408,16 @@ module function predict_batch_3d(self, input) result(res) type is(flatten_layer) res(:,i) = output_layer % output class default - error stop 'network % output not implemented for this output layer' + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' end select end do batch + ! We are now done with inference; + ! return to training mode for dropout layers. + call self % set_training_mode(.true.) 
+ end function predict_batch_3d @@ -455,6 +497,18 @@ module subroutine set_params(self, params) end subroutine set_params + module subroutine set_training_mode(self, training) + class(network), intent(in out) :: self + logical, intent(in) :: training + integer :: n + do n = 2, size(self % layers) + select type(this_layer => self % layers(n) % p); type is(dropout_layer) + this_layer % training = training + end select + end do + end subroutine set_training_mode + + module subroutine train(self, input_data, output_data, batch_size, & epochs, optimizer, loss) class(network), intent(in out) :: self diff --git a/src/nf/nf_random.f90 b/src/nf/nf_random.f90 index 57c5d11f..5160bc13 100644 --- a/src/nf/nf_random.f90 +++ b/src/nf/nf_random.f90 @@ -1,12 +1,12 @@ module nf_random - !! Provides a random number generator with - !! normal distribution, centered on zero. + !! Provides a random number generator with normal distribution, + !! centered on zero, and a Fisher-Yates shuffle. implicit none private - public :: random_normal + public :: random_normal, shuffle real, parameter :: pi = 4 * atan(1.d0) @@ -23,4 +23,22 @@ impure elemental subroutine random_normal(x) x = sqrt(- 2 * log(u(1))) * cos(2 * pi * u(2)) end subroutine random_normal + + subroutine shuffle(x) + !! Fisher-Yates shuffle. + real, intent(in out) :: x(:) + !! Array to shuffle + integer :: i, j + real :: r, temp + + do i = size(x), 2, -1 + call random_number(r) + j = floor(r * i) + 1 + temp = x(i) + x(i) = x(j) + x(j) = temp + end do + + end subroutine shuffle + end module nf_random diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 12236416..1716dc8c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,7 @@ foreach(execid input1d_layer input2d_layer input3d_layer + dropout_layer linear2d_layer parametric_activation dense_layer diff --git a/test/test_conv2d_network.f90 b/test/test_conv2d_network.f90 index 42d868df..1bdfc677 100644 --- a/test/test_conv2d_network.f90 +++ b/test/test_conv2d_network.f90 @@ -39,7 +39,7 @@ program test_conv2d_network type(network) :: cnn real :: y(1) - real :: tolerance = 1e-5 + real :: tolerance = 1e-4 integer :: n integer, parameter :: num_iterations = 1000 @@ -76,7 +76,7 @@ program test_conv2d_network type(network) :: cnn real :: x(1, 8, 8) real :: y(1) - real :: tolerance = 1e-5 + real :: tolerance = 1e-4 integer :: n integer, parameter :: num_iterations = 1000 @@ -111,7 +111,7 @@ program test_conv2d_network type(network) :: cnn real :: x(1, 12, 12) real :: y(9) - real :: tolerance = 1e-5 + real :: tolerance = 1e-4 integer :: n integer, parameter :: num_iterations = 5000 diff --git a/test/test_dropout_layer.f90 b/test/test_dropout_layer.f90 new file mode 100644 index 00000000..a79d0de5 --- /dev/null +++ b/test/test_dropout_layer.f90 @@ -0,0 +1,243 @@ +program test_dropout_layer + use iso_fortran_env, only: stderr => error_unit + use nf, only: dense, dropout, input, layer, network + use nf_dropout_layer, only: dropout_layer + type(layer) :: layer1 + type(network) :: net + integer :: input_size + + logical :: ok = .true. + + layer1 = dropout(0.5) + + if (.not. layer1 % name == 'dropout') then + ok = .false. + write(stderr, '(a)') 'dropout layer has its name set correctly.. failed' + end if + + ! Dropout on its own is not initialized and its arrays not allocated. + select type(layer1_p => layer1 % p) + type is(dropout_layer) + + if (layer1_p % dropout_rate /= 0.5) then + ok = .false. + write(stderr, '(a)') 'dropout layer dropout rate should be 0.5.. failed' + end if + + if (.not. 
layer1_p % training) then + ok = .false. + write(stderr, '(a)') 'dropout layer default training mode should be true.. failed' + end if + + if (layer1_p % input_size /= 0) then + print *, 'input_size: ', layer1_p % input_size + ok = .false. + write(stderr, '(a)') 'dropout layer size should be zero.. failed' + end if + + if (allocated(layer1_p % output)) then + ok = .false. + write(stderr, '(a)') 'dropout layer output array should not be allocated.. failed' + end if + + end select + + ! Now we're gonna initialize a minimal network with an input layer and a + ! dropout that follows and we'll check that the dropout layer has expected + ! state. + input_size = 10 + net = network([ & + input(input_size), & + dropout(0.5) & + ]) + + select type(layer1_p => net % layers(1) % p) + type is(dropout_layer) + if (layer1_p % input_size /= input_size) then + ok = .false. + write(stderr, '(a)') 'dropout layer input size should be the same as the input layer.. failed' + end if + + if (.not. allocated(layer1_p % output)) then + ok = .false. + write(stderr, '(a)') 'dropout layer output array should be allocated.. failed' + end if + + if (.not. allocated(layer1_p % gradient)) then + ok = .false. + write(stderr, '(a)') 'dropout layer gradient array should be allocated.. failed' + end if + + if (.not. allocated(layer1_p % mask)) then + ok = .false. + write(stderr, '(a)') 'dropout layer mask array should be allocated.. failed' + end if + + end select + + ! Test that the generated dropout mask matches the requested dropout rate. + test_mask: block + integer, parameter :: input_sizes(3) = [10, 100, 1000] + real, parameter :: dropout_rates(5) = [0., 0.2, 0.5, 0.8, 1.] + real, allocatable :: input_data(:) + integer :: i, j + + do i = 1, size(input_sizes) + do j = 1, size(dropout_rates) + + net = network([ & + input(input_sizes(i)), & + dropout(dropout_rates(j)) & + ]) + + if (allocated(input_data)) deallocate(input_data) + allocate(input_data(input_sizes(i))) + call random_number(input_data) + + call net % forward(input_data) + + select type(layer1_p => net % layers(2) % p) + type is(dropout_layer) + if (abs(sum(layer1_p % mask) / size(layer1_p % mask) - (1 - dropout_rates(j))) > 1e-6) then + ok = .false. + write(stderr, '(a)') 'actual dropout rate is equal to requested.. failed' + end if + end select + end do + end do + + end block test_mask + + + ! Now we're gonna run the forward pass and check that the dropout indeed + ! drops according to the requested dropout rate. + forward_pass: block + real :: input_data(10) + real :: output_data(size(input_data)) + real, parameter :: dropout_rate = 0.2 + real :: realized_dropout_rate + integer :: n + + net = network([ & + input(size(input_data)), & + dropout(dropout_rate) & + ]) + + do n = 1, 100 + + call random_number(input_data) + call net % forward(input_data) + + ! Check that sum of output matches sum of input within small tolerance + select type(layer1_p => net % layers(2) % p) + type is(dropout_layer) + realized_dropout_rate = 1 - sum(input_data * layer1_p % mask) / sum(layer1_p % output) + if (abs(realized_dropout_rate - dropout_rate) > 1e-6) then + ok = .false. + write(stderr, '(a)') 'realized dropout rate does not match requested dropout rate.. failed' + end if + end select + + end do + + if (.not. ok) write(stderr, '(a)') & + 'dropout layer output sum should match input sum within tolerance.. 
failed' + + end block forward_pass + + + training: block + real :: x(20), y(5) + real :: tolerance = 1e-4 + integer :: n + integer, parameter :: num_iterations = 100000 + + call random_number(x) + y = [0.12345, 0.23456, 0.34567, 0.45678, 0.56789] + + net = network([ & + input(20), & + dense(20), & + dropout(0.2), & + dense(5) & + ]) + + do n = 1, num_iterations + call net % forward(x) + call net % backward(y) + call net % update() + if (all(abs(net % predict(x) - y) < tolerance)) exit + end do + + if (.not. n <= num_iterations) then + write(stderr, '(a)') & + 'dense network should converge in simple training.. failed' + ok = .false. + end if + + end block training + + ! The following timing test is not part of the unit tests, but it's a good + ! way to see the performance difference between a network with and without + ! dropout. + timing: block + integer, parameter :: layer_size = 100 + integer, parameter :: num_iterations = 1000 + real :: x(layer_size), y(layer_size) + integer :: n + type(network) :: net1, net2 + real :: t1, t2 + real :: accumulated_time1 = 0 + real :: accumulated_time2 = 0 + + net1 = network([ & + input(layer_size), & + dense(layer_size), & + dense(layer_size) & + ]) + + net2 = network([ & + input(layer_size), & + dense(layer_size), & + dropout(0.5), & + dense(layer_size) & + ]) + + call random_number(y) + + ! Network without dropout + do n = 1, num_iterations + call random_number(x) + call cpu_time(t1) + call net1 % forward(x) + call net1 % backward(y) + call net1 % update() + call cpu_time(t2) + accumulated_time1 = accumulated_time1 + (t2 - t1) + end do + + ! Network with dropout + do n = 1, num_iterations + call random_number(x) + call cpu_time(t1) + call net2 % forward(x) + call net2 % backward(y) + call net2 % update() + call cpu_time(t2) + accumulated_time2 = accumulated_time2 + (t2 - t1) + end do + + ! Uncomment the following prints to see the timing results. + !print '(a, f9.6, a, f9.6, a)', 'No dropout time: ', accumulated_time1, ' seconds' + !print '(a, f9.6, a, f9.6, a)', 'Dropout time: ', accumulated_time2, ' seconds' + + end block timing + + if (ok) then + print '(a)', 'test_dropout_layer: All tests passed.' + else + write(stderr, '(a)') 'test_dropout_layer: One or more tests failed.' + stop 1 + end if + +end program test_dropout_layer
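
Usage note (reviewer sketch, not part of the patch): the short program below shows how the new `dropout` constructor composes with the existing `network` API, following the patched example/dense_mnist.f90 and test/test_dropout_layer.f90 above. The program name, layer sizes, and one-hot target are illustrative assumptions only.

program dropout_usage_sketch
  ! Reviewer sketch only; sizes and target data are illustrative.
  use nf, only: dense, dropout, input, network, relu, softmax
  implicit none

  type(network) :: net
  real :: x(784), y(10)

  ! Inverted dropout as implemented in nf_dropout_layer_submodule.f90:
  ! in training mode a fraction `rate` of activations is zeroed via a
  ! shuffled mask and the survivors are scaled by 1 / (1 - rate);
  ! in inference mode the layer passes its input through unchanged.
  net = network([ &
    input(784), &
    dense(64, relu()), &
    dropout(0.2), &
    dense(10, softmax()) &
  ])

  call random_number(x)
  y = 0
  y(1) = 1

  ! forward/backward/update run with the dropout mask active (training mode) ...
  call net % forward(x)
  call net % backward(y)
  call net % update()

  ! ... while predict temporarily switches dropout layers to inference mode
  ! via network % set_training_mode, as added in nf_network_submodule.f90.
  print *, net % predict(x)

end program dropout_usage_sketch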