From 039638d9364fd3a01eee5c2017612db4d1a39425 Mon Sep 17 00:00:00 2001
From: Milan Curcic
Date: Fri, 21 Feb 2025 12:38:53 -0500
Subject: [PATCH] Dropout layer (#194)

* First stab at dropout; conflict with base type TODO
* Partial dropout integration
* Test uninitialized dropout layer
* Test dropout state that follows an input layer
* Enable forward pass for dropout; backward pass TODO
* Version bump and add dropout to the features table
* Add dropout to CMake
* Enable preprocessing in fpm.toml (needed with recent versions of fpm)
* Small change in scale implementation
* Integration of backward pass for dropout
* Reduce tolerance in conv2d convergence tests
* Fix bug in dropout scaling

Co-authored-by: Ricardo Orsi <@ricor07>

* disable dropout in inference mode (net % predict); TODO enable in net % train
* Set dropout's training mode to true in net % train(); add tests
* WIP dropout tests
* Dropout layers always in training mode, except when `predict` is called, when they are in inference mode
* Update the layers table
* Ensure the actual dropout rate == requested dropout rate in most cases
* Accumulate the gradient in dropout % backward and flush in network % update
* Guard against bad dropout rate
* Connect the backward pass; expand tests
* Expand tests
* Use the reference scaling in dropout; don't accumulate gradients because it's not needed
* Add dropout to MNIST example; small model changes
* Add reference
* Update print_info dropout
* Update print_info
* Compute scale once in dropout constructor
* dropout % backward() doesn't need input from the previous layer
* Timing info of dropout

---------

Co-authored-by: Vandenplas, Jeremie
---
 CMakeLists.txt                             |   2 +
 README.md                                  |   5 +-
 example/dense_mnist.f90                    |   9 +-
 src/nf.f90                                 |   2 +-
 src/nf/nf_dropout_layer.f90                |  83 +++++++
 src/nf/nf_dropout_layer_submodule.f90      |  68 ++++++
 src/nf/nf_layer.f90                        |   2 +-
 src/nf/nf_layer_constructors.f90           |  20 +-
 src/nf/nf_layer_constructors_submodule.f90 |  12 +-
 src/nf/nf_layer_submodule.f90              |  53 ++++-
 src/nf/nf_network.f90                      |  10 +
 src/nf/nf_network_submodule.f90            |  76 ++++++-
 src/nf/nf_random.f90                       |  24 +-
 test/CMakeLists.txt                        |   1 +
 test/test_conv2d_network.f90               |   6 +-
 test/test_dropout_layer.f90                | 243 +++++++++++++++++++++
 16 files changed, 582 insertions(+), 34 deletions(-)
 create mode 100644 src/nf/nf_dropout_layer.f90
 create mode 100644 src/nf/nf_dropout_layer_submodule.f90
 create mode 100644 test/test_dropout_layer.f90

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc2ddfcb..eda96b28 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,8 @@ add_library(neural-fortran
   src/nf/nf_reshape_layer_submodule.f90
   src/nf/io/nf_io_binary.f90
   src/nf/io/nf_io_binary_submodule.f90
+  src/nf/nf_dropout_layer.f90
+  src/nf/nf_dropout_layer_submodule.f90
 )
 
 target_link_libraries(neural-fortran PRIVATE)
diff --git a/README.md b/README.md
index ebf7704d..a0eee745 100644
--- a/README.md
+++ b/README.md
@@ -30,11 +30,12 @@ Read the paper [here](https://arxiv.org/abs/1902.06714).
| Layer type | Constructor name | Supported input layers | Rank of output array | Forward pass | Backward pass | |------------|------------------|------------------------|----------------------|--------------|---------------| | Input | `input` | n/a | 1, 2, 3 | n/a | n/a | -| Dense (fully-connected) | `dense` | `input1d`, `flatten` | 1 | ✅ | ✅ | +| Dense (fully-connected) | `dense` | `input1d`, `dense`, `dropout`, `flatten` | 1 | ✅ | ✅ | +| Dropout | `dropout` | `dense`, `flatten`, `input1d` | 1 | ✅ | ✅ | | Convolutional (2-d) | `conv2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅(*) | | Max-pooling (2-d) | `maxpool2d` | `input3d`, `conv2d`, `maxpool2d`, `reshape` | 3 | ✅ | ✅ | | Flatten | `flatten` | `input2d`, `input3d`, `conv2d`, `maxpool2d`, `reshape` | 1 | ✅ | ✅ | -| Linear (2-d) | `linear2d` | `input2d` | 2 | ✅ | ✅ | +| Linear (2-d) | `linear2d` | `input2d`, `linear2d` | 2 | ✅ | ✅ | | Reshape (1-d to 3-d) | `reshape` | `input1d`, `dense`, `flatten` | 3 | ✅ | ✅ | (*) See Issue [#145](https://github.com/modern-fortran/neural-fortran/issues/145) regarding non-converging CNN training on the MNIST dataset. diff --git a/example/dense_mnist.f90 b/example/dense_mnist.f90 index c26d0ced..c1db2da4 100644 --- a/example/dense_mnist.f90 +++ b/example/dense_mnist.f90 @@ -1,6 +1,6 @@ program dense_mnist - use nf, only: dense, input, network, sgd, label_digits, load_mnist, corr + use nf, only: dense, input, network, sgd, label_digits, load_mnist, corr, relu, softmax, dropout implicit none @@ -17,8 +17,9 @@ program dense_mnist net = network([ & input(784), & - dense(30), & - dense(10) & + dense(64, relu()), & + dropout(0.2), & + dense(10, softmax()) & ]) num_epochs = 10 @@ -32,7 +33,7 @@ program dense_mnist call net % train( & training_images, & label_digits(training_labels), & - batch_size=100, & + batch_size=128, & epochs=1, & optimizer=sgd(learning_rate=3.) & ) diff --git a/src/nf.f90 b/src/nf.f90 index e9b027c1..7a989ea3 100644 --- a/src/nf.f90 +++ b/src/nf.f90 @@ -3,7 +3,7 @@ module nf use nf_datasets_mnist, only: label_digits, load_mnist use nf_layer, only: layer use nf_layer_constructors, only: & - conv2d, dense, flatten, input, maxpool2d, reshape, linear2d + conv2d, dense, dropout, flatten, input, linear2d, maxpool2d, reshape use nf_loss, only: mse, quadratic use nf_metrics, only: corr, maxabs use nf_network, only: network diff --git a/src/nf/nf_dropout_layer.f90 b/src/nf/nf_dropout_layer.f90 new file mode 100644 index 00000000..f7165aa0 --- /dev/null +++ b/src/nf/nf_dropout_layer.f90 @@ -0,0 +1,83 @@ +module nf_dropout_layer + + !! Dropout layer by Srivastava et al. (2014). + !! + !! Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I. and + !! Salakhutdinov, R., 2014. Dropout: a simple way to prevent neural networks + !! from overfitting. The Journal of Machine Learning Research, 16(1), + !! pp.1929-1958. + + use nf_base_layer, only: base_layer + + implicit none + + private + public :: dropout_layer + + type, extends(base_layer) :: dropout_layer + !! Concrete implementation of a dropout layer type + + integer :: input_size = 0 + + real, allocatable :: output(:) + real, allocatable :: gradient(:) + real, allocatable :: mask(:) ! binary mask for dropout + + real :: dropout_rate ! probability of dropping a neuron + real :: scale ! scale factor to preserve the input sum + logical :: training = .true. ! set to .false. 
for inference
+
+  contains
+
+    procedure :: backward
+    procedure :: forward
+    procedure :: init
+
+  end type dropout_layer
+
+  interface dropout_layer
+    module function dropout_layer_cons(rate) &
+        result(res)
+      !! This function returns the `dropout_layer` instance.
+      real, intent(in) :: rate
+        !! Dropout rate
+      type(dropout_layer) :: res
+        !! dropout_layer instance
+    end function dropout_layer_cons
+  end interface dropout_layer
+
+  interface
+
+    pure module subroutine backward(self, gradient)
+      !! Apply the backward pass to compute the layer gradient.
+      !! A dropout layer has no weights or biases to update; the incoming
+      !! gradient is masked and scaled the same way as the forward output.
+      class(dropout_layer), intent(in out) :: self
+        !! Dropout layer instance
+      real, intent(in) :: gradient(:)
+        !! Gradient from the next layer
+    end subroutine backward
+
+    module subroutine forward(self, input)
+      !! Propagate forward the layer.
+      !! Calling this subroutine updates the values of a few data components
+      !! of `dropout_layer` that are needed for the backward pass.
+      class(dropout_layer), intent(in out) :: self
+        !! Dropout layer instance
+      real, intent(in) :: input(:)
+        !! Input from the previous layer
+    end subroutine forward
+
+    module subroutine init(self, input_shape)
+      !! Initialize the layer data structures.
+      !!
+      !! This is a deferred procedure from the `base_layer` abstract type.
+      class(dropout_layer), intent(in out) :: self
+        !! Dropout layer instance
+      integer, intent(in) :: input_shape(:)
+        !! Shape of the input layer
+    end subroutine init
+
+  end interface
+
+end module nf_dropout_layer
diff --git a/src/nf/nf_dropout_layer_submodule.f90 b/src/nf/nf_dropout_layer_submodule.f90
new file mode 100644
index 00000000..3fe07b1a
--- /dev/null
+++ b/src/nf/nf_dropout_layer_submodule.f90
@@ -0,0 +1,68 @@
+submodule (nf_dropout_layer) nf_dropout_layer_submodule
+  use nf_random, only: shuffle
+  !! This submodule implements the procedures defined in the
+  !! nf_dropout_layer module.
+
+contains
+
+  module function dropout_layer_cons(rate) result(res)
+    real, intent(in) :: rate
+    type(dropout_layer) :: res
+    res % dropout_rate = rate
+    res % scale = 1 / (1 - rate)
+  end function dropout_layer_cons
+
+
+  module subroutine init(self, input_shape)
+    class(dropout_layer), intent(in out) :: self
+    integer, intent(in) :: input_shape(:)
+
+    self % input_size = input_shape(1)
+
+    ! Allocate arrays
+    allocate(self % output(self % input_size))
+    allocate(self % gradient(self % input_size))
+    allocate(self % mask(self % input_size))
+
+    ! Initialize arrays
+    self % output = 0
+    self % gradient = 0
+    self % mask = 1 ! Default mask is all ones (no dropout)
+
+  end subroutine init
+
+
+  module subroutine forward(self, input)
+    class(dropout_layer), intent(in out) :: self
+    real, intent(in) :: input(:)
+
+    ! Generate random mask for dropout, training mode only
+    if (self % training) then
+
+      ! Set the first size(input) * dropout_rate elements of the mask to 0,
+      ! the rest to 1, and shuffle. Note that the number of zeroed elements
+      ! rounds down to the nearest integer, so when size(input) * dropout_rate
+      ! is not an integer, the actual dropout rate will be slightly lower.
+      self % mask = 1
+      self % mask(:int(size(self % mask) * self % dropout_rate)) = 0
+      call shuffle(self % mask)
+
+      ! Apply dropout mask
+      self % output = input * self % mask * self % scale
+
+    else
+      ! 
In inference mode, we don't apply dropout; simply pass through the input + self % output = input + + end if + + end subroutine forward + + + pure module subroutine backward(self, gradient) + class(dropout_layer), intent(in out) :: self + real, intent(in) :: gradient(:) + self % gradient = gradient * self % mask * self % scale + end subroutine backward + +end submodule nf_dropout_layer_submodule \ No newline at end of file diff --git a/src/nf/nf_layer.f90 b/src/nf/nf_layer.f90 index 33d1c773..517622b0 100644 --- a/src/nf/nf_layer.f90 +++ b/src/nf/nf_layer.f90 @@ -91,7 +91,7 @@ end subroutine backward_3d interface - pure module subroutine forward(self, input) + module subroutine forward(self, input) !! Apply a forward pass on the layer. !! This changes the internal state of the layer. !! This is normally called internally by the `network % forward` diff --git a/src/nf/nf_layer_constructors.f90 b/src/nf/nf_layer_constructors.f90 index 2983ddcd..87ceeeea 100644 --- a/src/nf/nf_layer_constructors.f90 +++ b/src/nf/nf_layer_constructors.f90 @@ -8,7 +8,7 @@ module nf_layer_constructors implicit none private - public :: conv2d, dense, flatten, input, maxpool2d, reshape, linear2d + public :: conv2d, dense, dropout, flatten, input, linear2d, maxpool2d, reshape interface input @@ -104,6 +104,24 @@ module function dense(layer_size, activation) result(res) !! Resulting layer instance end function dense + module function dropout(rate) result(res) + !! Create a dropout layer with a given dropout rate. + !! + !! This layer is for randomly disabling neurons during training. + !! + !! Example: + !! + !! ``` + !! use nf, only :: dropout, layer + !! type(layer) :: dropout_layer + !! dropout_layer = dropout(rate=0.5) + !! ``` + real, intent(in) :: rate + !! Dropout rate - fraction of neurons to randomly disable during training + type(layer) :: res + !! Resulting layer instance + end function dropout + module function flatten() result(res) !! Flatten (3-d -> 1-d) layer constructor. !! diff --git a/src/nf/nf_layer_constructors_submodule.f90 b/src/nf/nf_layer_constructors_submodule.f90 index ae7d05dc..9558a0bc 100644 --- a/src/nf/nf_layer_constructors_submodule.f90 +++ b/src/nf/nf_layer_constructors_submodule.f90 @@ -3,6 +3,7 @@ use nf_layer, only: layer use nf_conv2d_layer, only: conv2d_layer use nf_dense_layer, only: dense_layer + use nf_dropout_layer, only: dropout_layer use nf_flatten_layer, only: flatten_layer use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer @@ -65,6 +66,16 @@ module function dense(layer_size, activation) result(res) end function dense + module function dropout(rate) result(res) + real, intent(in) :: rate + type(layer) :: res + if (rate < 0 .or. 
rate > 1) & + error stop 'rate must be between 0 and 1 in a dropout layer' + res % name = 'dropout' + allocate(res % p, source=dropout_layer(rate)) + end function dropout + + module function flatten() result(res) type(layer) :: res res % name = 'flatten' @@ -72,7 +83,6 @@ module function flatten() result(res) end function flatten - module function input1d(layer_size) result(res) integer, intent(in) :: layer_size type(layer) :: res diff --git a/src/nf/nf_layer_submodule.f90 b/src/nf/nf_layer_submodule.f90 index 22eabe9e..701dfe29 100644 --- a/src/nf/nf_layer_submodule.f90 +++ b/src/nf/nf_layer_submodule.f90 @@ -3,6 +3,7 @@ use iso_fortran_env, only: stderr => error_unit use nf_conv2d_layer, only: conv2d_layer use nf_dense_layer, only: dense_layer + use nf_dropout_layer, only: dropout_layer use nf_flatten_layer, only: flatten_layer use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer @@ -26,16 +27,22 @@ pure module subroutine backward_1d(self, previous, gradient) type is(dense_layer) - ! Upstream layers permitted: input1d, dense, flatten + ! Upstream layers permitted: input1d, dense, dropout, flatten select type(prev_layer => previous % p) type is(input1d_layer) call this_layer % backward(prev_layer % output, gradient) type is(dense_layer) call this_layer % backward(prev_layer % output, gradient) + type is(dropout_layer) + call this_layer % backward(prev_layer % output, gradient) type is(flatten_layer) call this_layer % backward(prev_layer % output, gradient) end select + type is(dropout_layer) + ! Upstream layers permitted: input1d, dense, dropout, flatten + call this_layer % backward(gradient) + type is(flatten_layer) ! Upstream layers permitted: input2d, input3d, conv2d, maxpool2d @@ -134,7 +141,7 @@ pure module subroutine backward_3d(self, previous, gradient) end subroutine backward_3d - pure module subroutine forward(self, input) + module subroutine forward(self, input) implicit none class(layer), intent(in out) :: self class(layer), intent(in) :: input @@ -143,6 +150,20 @@ pure module subroutine forward(self, input) type is(dense_layer) + ! Upstream layers permitted: input1d, dense, dropout, flatten + select type(prev_layer => input % p) + type is(input1d_layer) + call this_layer % forward(prev_layer % output) + type is(dense_layer) + call this_layer % forward(prev_layer % output) + type is(dropout_layer) + call this_layer % forward(prev_layer % output) + type is(flatten_layer) + call this_layer % forward(prev_layer % output) + end select + + type is(dropout_layer) + ! Upstream layers permitted: input1d, dense, flatten select type(prev_layer => input % p) type is(input1d_layer) @@ -301,17 +322,19 @@ impure elemental module subroutine init(self, input) call this_layer % init(input % layer_shape) end select - ! The shape of linear2d, conv2d, maxpool2d, or flatten layers is not known - ! until we receive an input layer. + ! The shape of conv2d, dropout, flatten, linear2d, or maxpool2d layers + ! is not known until we receive an input layer. 
select type(this_layer => self % p) type is(conv2d_layer) self % layer_shape = shape(this_layer % output) - type is(maxpool2d_layer) + type is(dropout_layer) self % layer_shape = shape(this_layer % output) type is(flatten_layer) self % layer_shape = shape(this_layer % output) type is(linear2d_layer) self % layer_shape = shape(this_layer % output) + type is(maxpool2d_layer) + self % layer_shape = shape(this_layer % output) end select self % input_layer_shape = input % layer_shape @@ -328,9 +351,14 @@ impure elemental module subroutine print_info(self) if (.not. self % name == 'input') & print '("Input shape: ", *(i0, 1x))', self % input_layer_shape print '("Output shape: ", *(i0, 1x))', self % layer_shape - print '("Parameters: ", i0)', self % get_num_params() - if (.not. self % name == 'input') & + if (.not. self % name == 'dropout') & + print '("Parameters: ", i0)', self % get_num_params() + if (.not. (self % name == 'input' .or. self % name == 'dropout')) & print '("Activation: ", a)', self % activation + select type (this_layer => self % p) + type is (dropout_layer) + print '("Dropout rate: ", f0.2)', this_layer % dropout_rate + end select print * end subroutine print_info @@ -349,6 +377,8 @@ elemental module function get_num_params(self) result(num_params) num_params = 0 type is (dense_layer) num_params = this_layer % get_num_params() + type is (dropout_layer) + num_params = 0 type is (conv2d_layer) num_params = this_layer % get_num_params() type is (maxpool2d_layer) @@ -378,6 +408,8 @@ module function get_params(self) result(params) ! No parameters to get. type is (dense_layer) params = this_layer % get_params() + type is (dropout_layer) + ! No parameters to get. type is (conv2d_layer) params = this_layer % get_params() type is (maxpool2d_layer) @@ -407,6 +439,8 @@ module function get_gradients(self) result(gradients) ! No gradients to get. type is (dense_layer) gradients = this_layer % get_gradients() + type is (dropout_layer) + ! No gradients to get. type is (conv2d_layer) gradients = this_layer % get_gradients() type is (maxpool2d_layer) @@ -461,6 +495,11 @@ module subroutine set_params(self, params) type is (dense_layer) call this_layer % set_params(params) + type is (dropout_layer) + ! No parameters to set. + write(stderr, '(a)') 'Warning: calling set_params() ' & + // 'on a zero-parameter layer; nothing to do.' + type is (conv2d_layer) call this_layer % set_params(params) diff --git a/src/nf/nf_network.f90 b/src/nf/nf_network.f90 index fa7ea4eb..5916924e 100644 --- a/src/nf/nf_network.f90 +++ b/src/nf/nf_network.f90 @@ -26,6 +26,7 @@ module nf_network procedure :: get_params procedure :: print_info procedure :: set_params + procedure :: set_training_mode procedure :: train procedure :: update @@ -223,6 +224,15 @@ module subroutine set_params(self, params) !! Network parameters to set end subroutine set_params + module subroutine set_training_mode(self, training) + !! Set the mode to training (.true.) or inference (.false.). + !! Used internally to enable/disable the dropout layers in the network. + class(network), intent(in out) :: self + !! Network instance + logical, intent(in) :: training + !! .true. for training mode, .false. for inference. + end subroutine set_training_mode + module subroutine print_info(self) !! Prints a brief summary of the network and its layers to the screen. 
class(network), intent(in) :: self diff --git a/src/nf/nf_network_submodule.f90 b/src/nf/nf_network_submodule.f90 index c2a9c903..dd632d96 100644 --- a/src/nf/nf_network_submodule.f90 +++ b/src/nf/nf_network_submodule.f90 @@ -2,6 +2,7 @@ use nf_conv2d_layer, only: conv2d_layer use nf_dense_layer, only: dense_layer + use nf_dropout_layer, only: dropout_layer use nf_flatten_layer, only: flatten_layer use nf_input1d_layer, only: input1d_layer use nf_input2d_layer, only: input2d_layer @@ -141,7 +142,8 @@ module subroutine backward(self, output, loss) select type(next_layer => self % layers(n + 1) % p) type is(dense_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) - + type is(dropout_layer) + call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) type is(conv2d_layer) call self % layers(n) % backward(self % layers(n - 1), next_layer % gradient) @@ -251,19 +253,27 @@ module function predict_1d(self, input) result(res) class(network), intent(in out) :: self real, intent(in) :: input(:) real, allocatable :: res(:) - integer :: num_layers + integer :: n, num_layers num_layers = size(self % layers) + ! predict is run in inference mode only; + ! set all dropout layers' training mode to false, and + ! return to training mode after inference. + call self % set_training_mode(.false.) call self % forward(input) + call self % set_training_mode(.true.) select type(output_layer => self % layers(num_layers) % p) type is(dense_layer) res = output_layer % output + type is(dropout_layer) + res = output_layer % output type is(flatten_layer) res = output_layer % output class default - error stop 'network % output not implemented for this output layer' + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' end select end function predict_1d @@ -273,11 +283,16 @@ module function predict_2d(self, input) result(res) class(network), intent(in out) :: self real, intent(in) :: input(:,:) real, allocatable :: res(:) - integer :: num_layers + integer :: n, num_layers num_layers = size(self % layers) + ! predict is run in inference mode only; + ! set all dropout layers' training mode to false, and + ! return to training mode after inference. + call self % set_training_mode(.false.) call self % forward(input) + call self % set_training_mode(.true.) select type(output_layer => self % layers(num_layers) % p) type is(dense_layer) @@ -285,7 +300,8 @@ module function predict_2d(self, input) result(res) type is(flatten_layer) res = output_layer % output class default - error stop 'network % output not implemented for this output layer' + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' end select end function predict_2d @@ -295,11 +311,16 @@ module function predict_3d(self, input) result(res) class(network), intent(in out) :: self real, intent(in) :: input(:,:,:) real, allocatable :: res(:) - integer :: num_layers + integer :: n, num_layers num_layers = size(self % layers) + ! predict is run in inference mode only; + ! set all dropout layers' training mode to false, and + ! return to training mode after inference. + call self % set_training_mode(.false.) call self % forward(input) + call self % set_training_mode(.true.) 
select type(output_layer => self % layers(num_layers) % p) type is(conv2d_layer) @@ -310,7 +331,8 @@ module function predict_3d(self, input) result(res) type is(flatten_layer) res = output_layer % output class default - error stop 'network % output not implemented for this output layer' + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' end select end function predict_3d @@ -320,12 +342,17 @@ module function predict_batch_1d(self, input) result(res) class(network), intent(in out) :: self real, intent(in) :: input(:,:) real, allocatable :: res(:,:) - integer :: i, batch_size, num_layers, output_size + integer :: i, n, batch_size, num_layers, output_size num_layers = size(self % layers) batch_size = size(input, dim=rank(input)) output_size = product(self % layers(num_layers) % layer_shape) + ! predict is run in inference mode only; + ! set all dropout layers' training mode to false, and + ! return to training mode after inference. + call self % set_training_mode(.false.) + allocate(res(output_size, batch_size)) batch: do i = 1, size(res, dim=2) @@ -338,11 +365,16 @@ module function predict_batch_1d(self, input) result(res) type is(flatten_layer) res(:,i) = output_layer % output class default - error stop 'network % output not implemented for this output layer' + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' end select end do batch + ! We are now done with inference; + ! return to training mode for dropout layers. + call self % set_training_mode(.true.) + end function predict_batch_1d @@ -350,12 +382,17 @@ module function predict_batch_3d(self, input) result(res) class(network), intent(in out) :: self real, intent(in) :: input(:,:,:,:) real, allocatable :: res(:,:) - integer :: i, batch_size, num_layers, output_size + integer :: i, n, batch_size, num_layers, output_size num_layers = size(self % layers) batch_size = size(input, dim=rank(input)) output_size = product(self % layers(num_layers) % layer_shape) + ! predict is run in inference mode only; + ! set all dropout layers' training mode to false, and + ! return to training mode after inference. + call self % set_training_mode(.false.) + allocate(res(output_size, batch_size)) batch: do i = 1, batch_size @@ -371,11 +408,16 @@ module function predict_batch_3d(self, input) result(res) type is(flatten_layer) res(:,i) = output_layer % output class default - error stop 'network % output not implemented for this output layer' + error stop 'network % output not implemented for ' // & + trim(self % layers(num_layers) % name) // ' layer' end select end do batch + ! We are now done with inference; + ! return to training mode for dropout layers. + call self % set_training_mode(.true.) 
+ end function predict_batch_3d @@ -455,6 +497,18 @@ module subroutine set_params(self, params) end subroutine set_params + module subroutine set_training_mode(self, training) + class(network), intent(in out) :: self + logical, intent(in) :: training + integer :: n + do n = 2, size(self % layers) + select type(this_layer => self % layers(n) % p); type is(dropout_layer) + this_layer % training = training + end select + end do + end subroutine set_training_mode + + module subroutine train(self, input_data, output_data, batch_size, & epochs, optimizer, loss) class(network), intent(in out) :: self diff --git a/src/nf/nf_random.f90 b/src/nf/nf_random.f90 index 57c5d11f..5160bc13 100644 --- a/src/nf/nf_random.f90 +++ b/src/nf/nf_random.f90 @@ -1,12 +1,12 @@ module nf_random - !! Provides a random number generator with - !! normal distribution, centered on zero. + !! Provides a random number generator with normal distribution, + !! centered on zero, and a Fisher-Yates shuffle. implicit none private - public :: random_normal + public :: random_normal, shuffle real, parameter :: pi = 4 * atan(1.d0) @@ -23,4 +23,22 @@ impure elemental subroutine random_normal(x) x = sqrt(- 2 * log(u(1))) * cos(2 * pi * u(2)) end subroutine random_normal + + subroutine shuffle(x) + !! Fisher-Yates shuffle. + real, intent(in out) :: x(:) + !! Array to shuffle + integer :: i, j + real :: r, temp + + do i = size(x), 2, -1 + call random_number(r) + j = floor(r * i) + 1 + temp = x(i) + x(i) = x(j) + x(j) = temp + end do + + end subroutine shuffle + end module nf_random diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 12236416..1716dc8c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,7 @@ foreach(execid input1d_layer input2d_layer input3d_layer + dropout_layer linear2d_layer parametric_activation dense_layer diff --git a/test/test_conv2d_network.f90 b/test/test_conv2d_network.f90 index 42d868df..1bdfc677 100644 --- a/test/test_conv2d_network.f90 +++ b/test/test_conv2d_network.f90 @@ -39,7 +39,7 @@ program test_conv2d_network type(network) :: cnn real :: y(1) - real :: tolerance = 1e-5 + real :: tolerance = 1e-4 integer :: n integer, parameter :: num_iterations = 1000 @@ -76,7 +76,7 @@ program test_conv2d_network type(network) :: cnn real :: x(1, 8, 8) real :: y(1) - real :: tolerance = 1e-5 + real :: tolerance = 1e-4 integer :: n integer, parameter :: num_iterations = 1000 @@ -111,7 +111,7 @@ program test_conv2d_network type(network) :: cnn real :: x(1, 12, 12) real :: y(9) - real :: tolerance = 1e-5 + real :: tolerance = 1e-4 integer :: n integer, parameter :: num_iterations = 5000 diff --git a/test/test_dropout_layer.f90 b/test/test_dropout_layer.f90 new file mode 100644 index 00000000..a79d0de5 --- /dev/null +++ b/test/test_dropout_layer.f90 @@ -0,0 +1,243 @@ +program test_dropout_layer + use iso_fortran_env, only: stderr => error_unit + use nf, only: dense, dropout, input, layer, network + use nf_dropout_layer, only: dropout_layer + type(layer) :: layer1 + type(network) :: net + integer :: input_size + + logical :: ok = .true. + + layer1 = dropout(0.5) + + if (.not. layer1 % name == 'dropout') then + ok = .false. + write(stderr, '(a)') 'dropout layer has its name set correctly.. failed' + end if + + ! Dropout on its own is not initialized and its arrays not allocated. + select type(layer1_p => layer1 % p) + type is(dropout_layer) + + if (layer1_p % dropout_rate /= 0.5) then + ok = .false. + write(stderr, '(a)') 'dropout layer dropout rate should be 0.5.. failed' + end if + + if (.not. 
layer1_p % training) then + ok = .false. + write(stderr, '(a)') 'dropout layer default training mode should be true.. failed' + end if + + if (layer1_p % input_size /= 0) then + print *, 'input_size: ', layer1_p % input_size + ok = .false. + write(stderr, '(a)') 'dropout layer size should be zero.. failed' + end if + + if (allocated(layer1_p % output)) then + ok = .false. + write(stderr, '(a)') 'dropout layer output array should not be allocated.. failed' + end if + + end select + + ! Now we're gonna initialize a minimal network with an input layer and a + ! dropout that follows and we'll check that the dropout layer has expected + ! state. + input_size = 10 + net = network([ & + input(input_size), & + dropout(0.5) & + ]) + + select type(layer1_p => net % layers(1) % p) + type is(dropout_layer) + if (layer1_p % input_size /= input_size) then + ok = .false. + write(stderr, '(a)') 'dropout layer input size should be the same as the input layer.. failed' + end if + + if (.not. allocated(layer1_p % output)) then + ok = .false. + write(stderr, '(a)') 'dropout layer output array should be allocated.. failed' + end if + + if (.not. allocated(layer1_p % gradient)) then + ok = .false. + write(stderr, '(a)') 'dropout layer gradient array should be allocated.. failed' + end if + + if (.not. allocated(layer1_p % mask)) then + ok = .false. + write(stderr, '(a)') 'dropout layer mask array should be allocated.. failed' + end if + + end select + + ! Test that the generated dropout mask matches the requested dropout rate. + test_mask: block + integer, parameter :: input_sizes(3) = [10, 100, 1000] + real, parameter :: dropout_rates(5) = [0., 0.2, 0.5, 0.8, 1.] + real, allocatable :: input_data(:) + integer :: i, j + + do i = 1, size(input_sizes) + do j = 1, size(dropout_rates) + + net = network([ & + input(input_sizes(i)), & + dropout(dropout_rates(j)) & + ]) + + if (allocated(input_data)) deallocate(input_data) + allocate(input_data(input_sizes(i))) + call random_number(input_data) + + call net % forward(input_data) + + select type(layer1_p => net % layers(2) % p) + type is(dropout_layer) + if (abs(sum(layer1_p % mask) / size(layer1_p % mask) - (1 - dropout_rates(j))) > 1e-6) then + ok = .false. + write(stderr, '(a)') 'actual dropout rate is equal to requested.. failed' + end if + end select + end do + end do + + end block test_mask + + + ! Now we're gonna run the forward pass and check that the dropout indeed + ! drops according to the requested dropout rate. + forward_pass: block + real :: input_data(10) + real :: output_data(size(input_data)) + real, parameter :: dropout_rate = 0.2 + real :: realized_dropout_rate + integer :: n + + net = network([ & + input(size(input_data)), & + dropout(dropout_rate) & + ]) + + do n = 1, 100 + + call random_number(input_data) + call net % forward(input_data) + + ! Check that sum of output matches sum of input within small tolerance + select type(layer1_p => net % layers(2) % p) + type is(dropout_layer) + realized_dropout_rate = 1 - sum(input_data * layer1_p % mask) / sum(layer1_p % output) + if (abs(realized_dropout_rate - dropout_rate) > 1e-6) then + ok = .false. + write(stderr, '(a)') 'realized dropout rate does not match requested dropout rate.. failed' + end if + end select + + end do + + if (.not. ok) write(stderr, '(a)') & + 'dropout layer output sum should match input sum within tolerance.. 
failed' + + end block forward_pass + + + training: block + real :: x(20), y(5) + real :: tolerance = 1e-4 + integer :: n + integer, parameter :: num_iterations = 100000 + + call random_number(x) + y = [0.12345, 0.23456, 0.34567, 0.45678, 0.56789] + + net = network([ & + input(20), & + dense(20), & + dropout(0.2), & + dense(5) & + ]) + + do n = 1, num_iterations + call net % forward(x) + call net % backward(y) + call net % update() + if (all(abs(net % predict(x) - y) < tolerance)) exit + end do + + if (.not. n <= num_iterations) then + write(stderr, '(a)') & + 'dense network should converge in simple training.. failed' + ok = .false. + end if + + end block training + + ! The following timing test is not part of the unit tests, but it's a good + ! way to see the performance difference between a network with and without + ! dropout. + timing: block + integer, parameter :: layer_size = 100 + integer, parameter :: num_iterations = 1000 + real :: x(layer_size), y(layer_size) + integer :: n + type(network) :: net1, net2 + real :: t1, t2 + real :: accumulated_time1 = 0 + real :: accumulated_time2 = 0 + + net1 = network([ & + input(layer_size), & + dense(layer_size), & + dense(layer_size) & + ]) + + net2 = network([ & + input(layer_size), & + dense(layer_size), & + dropout(0.5), & + dense(layer_size) & + ]) + + call random_number(y) + + ! Network without dropout + do n = 1, num_iterations + call random_number(x) + call cpu_time(t1) + call net1 % forward(x) + call net1 % backward(y) + call net1 % update() + call cpu_time(t2) + accumulated_time1 = accumulated_time1 + (t2 - t1) + end do + + ! Network with dropout + do n = 1, num_iterations + call random_number(x) + call cpu_time(t1) + call net2 % forward(x) + call net2 % backward(y) + call net2 % update() + call cpu_time(t2) + accumulated_time2 = accumulated_time2 + (t2 - t1) + end do + + ! Uncomment the following prints to see the timing results. + !print '(a, f9.6, a, f9.6, a)', 'No dropout time: ', accumulated_time1, ' seconds' + !print '(a, f9.6, a, f9.6, a)', 'Dropout time: ', accumulated_time2, ' seconds' + + end block timing + + if (ok) then + print '(a)', 'test_dropout_layer: All tests passed.' + else + write(stderr, '(a)') 'test_dropout_layer: One or more tests failed.' + stop 1 + end if + +end program test_dropout_layer
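
Usage note (reviewer sketch, not part of the patch): the short program below shows how the new `dropout` constructor composes with the existing `network` API, following the patched example/dense_mnist.f90 and test/test_dropout_layer.f90 above. The program name, layer sizes, and one-hot target are illustrative assumptions only.

program dropout_usage_sketch
  ! Reviewer sketch only; sizes and target data are illustrative.
  use nf, only: dense, dropout, input, network, relu, softmax
  implicit none

  type(network) :: net
  real :: x(784), y(10)

  ! Inverted dropout as implemented in nf_dropout_layer_submodule.f90:
  ! in training mode a fraction `rate` of activations is zeroed via a
  ! shuffled mask and the survivors are scaled by 1 / (1 - rate);
  ! in inference mode the layer passes its input through unchanged.
  net = network([ &
    input(784), &
    dense(64, relu()), &
    dropout(0.2), &
    dense(10, softmax()) &
  ])

  call random_number(x)
  y = 0
  y(1) = 1

  ! forward/backward/update run with the dropout mask active (training mode) ...
  call net % forward(x)
  call net % backward(y)
  call net % update()

  ! ... while predict temporarily switches dropout layers to inference mode
  ! via network % set_training_mode, as added in nf_network_submodule.f90.
  print *, net % predict(x)

end program dropout_usage_sketch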