Dropout layer (modern-fortran#194)
* First stab at dropout; conflict with base type TODO

* Partial dropout integration

* Test uninitialized dropout layer

* Test dropout state that follows an input layer

* Enable forward pass for dropout; backward pass TODO

* Version bump and add dropout to the features table

* Add dropout to CMake

* Enable preprocessing in fpm.toml (needed with recent versions of fpm)

* Small change in scale implementation

* Integration of backward pass for dropout

* Reduce tolerance in conv2d convergence tests

* Fix bug in dropout scaling

Co-authored-by: Ricardo Orsi <@ricor07>

* Disable dropout in inference mode (net % predict); TODO enable in net % train

* Set dropout's training mode to true in net % train(); add tests

* WIP dropout tests

* Dropout layers are always in training mode, except when `net % predict()` is called, in which case they run in inference mode

* Update the layers table

* Ensure the actual dropout rate == requested dropout rate in most cases

* Accumulate the gradient in dropout % backward and flush in network % update

* Guard against bad dropout rate

* Connect the backward pass; expand tests

* Expand tests

* Use the reference scaling in dropout; don't accumulate gradients because it's not needed

* Add dropout to MNIST example; small model changes

* Add reference

* Update print_info dropout

* Update print_info

* Compute scale once in dropout constructor

* dropout % backward() doesn't need input from the previous layer

* Timing info of dropout

---------

Co-authored-by: Vandenplas, Jeremie <[email protected]>
milancurcic and Vandenplas, Jeremie authored Feb 21, 2025
1 parent c316ee1 commit 039638d
Showing 16 changed files with 582 additions and 34 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -55,6 +55,8 @@ add_library(neural-fortran
src/nf/nf_reshape_layer_submodule.f90
src/nf/io/nf_io_binary.f90
src/nf/io/nf_io_binary_submodule.f90
src/nf/nf_dropout_layer.f90
src/nf/nf_dropout_layer_submodule.f90
)

target_link_libraries(neural-fortran PRIVATE)
5 changes: 3 additions & 2 deletions README.md
@@ -30,11 +30,12 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
| Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass |
|------------|------------------|------------------------|----------------------|--------------|---------------|
| Input | `input` | n/a | 1, 2, 3 | n/a | n/a |
| Dense (fully-connected) | `dense` | `input1d`, `flatten` | 1 | ✅ | ✅ |
| Dense (fully-connected) | `dense` | `input1d`, `dense`, `dropout`, `flatten` | 1 | ✅ | ✅ |
| Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ |
| Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) |
| Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ |
| Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ |
| Linear (2-d) | `linear2d` | `input2d` | 2 | ✅ | ✅ |
| Linear (2-d) | `linear2d` | `input2d`, `linear2d` | 2 | ✅ | ✅ |
| Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ |

(*) See Issue [#145](https://github.com/modern-fortran/neural-fortran/issues/145) regarding non-converging CNN training on the MNIST dataset.
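
For orientation, here is a minimal sketch (not part of this diff; the layer sizes and rate are arbitrary) of how the new `dropout` constructor composes with `dense` layers, mirroring the MNIST example updated below:

```fortran
program dropout_usage_sketch
  use nf, only: dense, dropout, input, network, relu, softmax
  implicit none
  type(network) :: net
  ! Dropout between two dense layers: during training it zeroes ~20% of the
  ! activations and rescales the rest; during net % predict it passes
  ! activations through unchanged.
  net = network([ &
    input(784), &
    dense(64, relu()), &
    dropout(0.2), &
    dense(10, softmax()) &
  ])
end program dropout_usage_sketch
```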
9 changes: 5 additions & 4 deletions example/dense_mnist.f90
@@ -1,6 +1,6 @@
program dense_mnist

use nf, only: dense, input, network, sgd, label_digits, load_mnist, corr
use nf, only: dense, input, network, sgd, label_digits, load_mnist, corr, relu, softmax, dropout

implicit none

@@ -17,8 +17,9 @@ program dense_mnist

net = network([ &
input(784), &
dense(30), &
dense(10) &
dense(64, relu()), &
dropout(0.2), &
dense(10, softmax()) &
])
num_epochs = 10

@@ -32,7 +33,7 @@ program dense_mnist
call net % train( &
training_images, &
label_digits(training_labels), &
batch_size=100, &
batch_size=128, &
epochs=1, &
optimizer=sgd(learning_rate=3.) &
)
2 changes: 1 addition & 1 deletion src/nf.f90
@@ -3,7 +3,7 @@ module nf
use nf_datasets_mnist, only: label_digits, load_mnist
use nf_layer, only: layer
use nf_layer_constructors, only: &
conv2d, dense, flatten, input, maxpool2d, reshape, linear2d
conv2d, dense, dropout, flatten, input, linear2d, maxpool2d, reshape
use nf_loss, only: mse, quadratic
use nf_metrics, only: corr, maxabs
use nf_network, only: network
83 changes: 83 additions & 0 deletions src/nf/nf_dropout_layer.f90
@@ -0,0 +1,83 @@
module nf_dropout_layer

!! Dropout layer by Srivastava et al. (2014).
!!
!! Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I. and
!! Salakhutdinov, R., 2014. Dropout: a simple way to prevent neural networks
!! from overfitting. The Journal of Machine Learning Research, 15(1),
!! pp.1929-1958.

use nf_base_layer, only: base_layer

implicit none

private
public :: dropout_layer

type, extends(base_layer) :: dropout_layer
!! Concrete implementation of a dropout layer type

integer :: input_size = 0

real, allocatable :: output(:)
real, allocatable :: gradient(:)
real, allocatable :: mask(:) ! binary mask for dropout

real :: dropout_rate ! probability of dropping a neuron
real :: scale ! scale factor to preserve the input sum
logical :: training = .true. ! set to .false. for inference

contains

procedure :: backward
procedure :: forward
procedure :: init

end type dropout_layer

interface dropout_layer
module function dropout_layer_cons(rate) &
result(res)
!! This function returns the `dropout_layer` instance.
real, intent(in) :: rate
!! Dropout rate
type(dropout_layer) :: res
!! dropout_layer instance
end function dropout_layer_cons
end interface dropout_layer

interface

pure module subroutine backward(self, gradient)
!! Apply the backward pass to compute the gradient with respect to
!! the layer input. The dropout layer has no weights or biases to
!! update; the incoming gradient is masked and scaled.
class(dropout_layer), intent(in out) :: self
!! Dropout layer instance
real, intent(in) :: gradient(:)
!! Gradient from the next layer
end subroutine backward

module subroutine forward(self, input)
!! Propagate forward the layer.
!! Calling this subroutine updates the values of a few data components
!! of `dropout_layer` that are needed for the backward pass.
class(dropout_layer), intent(in out) :: self
!! Dropout layer instance
real, intent(in) :: input(:)
!! Input from the previous layer
end subroutine forward

module subroutine init(self, input_shape)
!! Initialize the layer data structures.
!!
!! This is a deferred procedure from the `base_layer` abstract type.
class(dropout_layer), intent(in out) :: self
!! Dropout layer instance
integer, intent(in) :: input_shape(:)
!! Shape of the input layer
end subroutine init

end interface

end module nf_dropout_layer
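
A minimal sketch (not part of this commit) of driving the concrete `dropout_layer` type through the interfaces declared above; in normal use it is wrapped in the generic `layer` type via the `dropout` constructor shown further down:

```fortran
program dropout_layer_sketch
  use nf_dropout_layer, only: dropout_layer
  implicit none
  type(dropout_layer) :: drop
  real :: x(4), dy(4)
  drop = dropout_layer(0.5)  ! stores the rate and scale = 1 / (1 - 0.5)
  call drop % init([4])      ! allocates output, gradient, and mask
  x = [1., 2., 3., 4.]
  call drop % forward(x)     ! zeroes int(4 * 0.5) = 2 elements, scales the rest by 2
  dy = 1.
  call drop % backward(dy)   ! gradient = dy * mask * scale
  print *, drop % output
  print *, drop % gradient
end program dropout_layer_sketch
```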
68 changes: 68 additions & 0 deletions src/nf/nf_dropout_layer_submodule.f90
@@ -0,0 +1,68 @@
submodule (nf_dropout_layer) nf_dropout_layer_submodule
use nf_random, only: shuffle
!! This submodule implements the procedures defined in the
!! nf_dropout_layer module.

contains

module function dropout_layer_cons(rate) result(res)
real, intent(in) :: rate
type(dropout_layer) :: res
res % dropout_rate = rate
res % scale = 1 / (1 - rate)
end function dropout_layer_cons


module subroutine init(self, input_shape)
class(dropout_layer), intent(in out) :: self
integer, intent(in) :: input_shape(:)

self % input_size = input_shape(1)

! Allocate arrays
allocate(self % output(self % input_size))
allocate(self % gradient(self % input_size))
allocate(self % mask(self % input_size))

! Initialize arrays
self % output = 0
self % gradient = 0
self % mask = 1 ! Default mask is all ones (no dropout)

end subroutine init


module subroutine forward(self, input)
class(dropout_layer), intent(in out) :: self
real, intent(in) :: input(:)

! Generate random mask for dropout, training mode only
if (self % training) then

! Set the first size(mask) * dropout_rate elements to 0, the rest to 1,
! and shuffle. Note that the selection of the elements rounds down to
! the nearest integer, so in cases where size(input) * dropout_rate is
! not an integer, the actual dropout rate will be slightly lower.
self % mask = 1
self % mask(:int(size(self % mask) * self % dropout_rate)) = 0
call shuffle(self % mask)

! Apply dropout mask
self % output = input * self % mask * self % scale

else
! In inference mode, we don't apply dropout; simply pass through the input
self % output = input

end if

end subroutine forward


pure module subroutine backward(self, gradient)
class(dropout_layer), intent(in out) :: self
real, intent(in) :: gradient(:)
self % gradient = gradient * self % mask * self % scale
end subroutine backward

end submodule nf_dropout_layer_submodule
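
To make the scaling concrete: with `rate = 0.2` and 10 inputs, `int(10 * 0.2) = 2` mask elements are zeroed and the survivors are multiplied by `scale = 1 / (1 - 0.2) = 1.25`, so each element keeps its expected value, `E[output_i] = (8/10) * 1.25 * input_i = input_i`. When `size(input) * dropout_rate` is not an integer, the truncation noted in the comment above makes the realized rate slightly lower than requested and the expectation is preserved only approximately.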
2 changes: 1 addition & 1 deletion src/nf/nf_layer.f90
@@ -91,7 +91,7 @@ end subroutine backward_3d

interface

pure module subroutine forward(self, input)
module subroutine forward(self, input)
!! Apply a forward pass on the layer.
!! This changes the internal state of the layer.
!! This is normally called internally by the `network % forward`
20 changes: 19 additions & 1 deletion src/nf/nf_layer_constructors.f90
@@ -8,7 +8,7 @@ module nf_layer_constructors
implicit none

private
public :: conv2d, dense, flatten, input, maxpool2d, reshape, linear2d
public :: conv2d, dense, dropout, flatten, input, linear2d, maxpool2d, reshape

interface input

@@ -104,6 +104,24 @@ module function dense(layer_size, activation) result(res)
!! Resulting layer instance
end function dense

module function dropout(rate) result(res)
!! Create a dropout layer with a given dropout rate.
!!
!! This layer is for randomly disabling neurons during training.
!!
!! Example:
!!
!! ```
!! use nf, only: dropout, layer
!! type(layer) :: dropout_layer
!! dropout_layer = dropout(rate=0.5)
!! ```
real, intent(in) :: rate
!! Dropout rate - fraction of neurons to randomly disable during training
type(layer) :: res
!! Resulting layer instance
end function dropout

module function flatten() result(res)
!! Flatten (3-d -> 1-d) layer constructor.
!!
12 changes: 11 additions & 1 deletion src/nf/nf_layer_constructors_submodule.f90
@@ -3,6 +3,7 @@
use nf_layer, only: layer
use nf_conv2d_layer, only: conv2d_layer
use nf_dense_layer, only: dense_layer
use nf_dropout_layer, only: dropout_layer
use nf_flatten_layer, only: flatten_layer
use nf_input1d_layer, only: input1d_layer
use nf_input2d_layer, only: input2d_layer
@@ -65,14 +66,23 @@ module function dense(layer_size, activation) result(res)
end function dense


module function dropout(rate) result(res)
real, intent(in) :: rate
type(layer) :: res
if (rate < 0 .or. rate > 1) &
error stop 'rate must be between 0 and 1 in a dropout layer'
res % name = 'dropout'
allocate(res % p, source=dropout_layer(rate))
end function dropout


module function flatten() result(res)
type(layer) :: res
res % name = 'flatten'
allocate(res % p, source=flatten_layer())
end function flatten



module function input1d(layer_size) result(res)
integer, intent(in) :: layer_size
type(layer) :: res