Merge pull request #13 from LuxDL/fm/fp

[WIP] Flux feature parity
LuxDL · Feb 27, 2024 · 438aaac · 438aaac · avik-pal · Feb 27, 2024
2 parents 273a0cf + 2af0c18
commit 438aaac
Show file tree

Hide file tree

Showing 5 changed files with 399 additions and 10 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,10 +1,11 @@
 name = "WeightInitializers"
 uuid = "d49dbf32-c5c2-4618-8acc-27bb2598ef2d"
 authors = ["Avik Pal <[email protected]> and contributors"]
-version = "0.1.5"
+version = "0.1.6"
 
 [deps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 PartialFunctions = "570af359-4316-4cb7-8c74-252c00c2016b"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -21,6 +22,7 @@ WeightInitializersCUDAExt = "CUDA"
 Aqua = "0.8"
 CUDA = "5"
 ChainRulesCore = "1.21"
+LinearAlgebra = "1.9"
 PartialFunctions = "1.2"
 PrecompileTools = "1.2"
 Random = "1.9"

diff --git a/ext/WeightInitializersCUDAExt.jl b/ext/WeightInitializersCUDAExt.jl
@@ -1,7 +1,9 @@
 module WeightInitializersCUDAExt
 
 using WeightInitializers, CUDA
-import WeightInitializers: __partial_apply, NUM_TO_FPOINT
+using Random
+import WeightInitializers: __partial_apply, NUM_TO_FPOINT, identity_init, sparse_init,
+                           orthogonal
 
 const AbstractCuRNG = Union{CUDA.RNG, CURAND.RNG}
 
@@ -19,4 +21,59 @@ for T in ("16", "32", "64", "C16", "C32", "C64"), fname in (:ones, :zeros)
     end
 end
 
+# GPU method of `sparse_init`: draws a `rows × cols` array of normal samples with `rng`,
+# scales it by `std`, zeroes the first `ceil(sparsity * rows)` entries of every column and
+# then shuffles each column independently.
+#
+# Throws an `ArgumentError` unless exactly two dimensions are given.
+function sparse_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...;
+        sparsity::Number, std::Number=T(0.01)) where {T <: Number}
+    if length(dims) != 2
+        throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization."))
+    end
+
+    rows = first(dims)
+    prop_zero = min(1.0, sparsity)
+    num_zeros = ceil(Integer, prop_zero * rows)
+    # Convert `std` to `T` first: multiplying by e.g. a `Float64` `std` would otherwise
+    # promote the element type of the result away from the requested `T`.
+    sparse_array = randn(rng, T, dims...) .* T(std)
+    sparse_array[1:num_zeros, :] .= CUDA.zero(T)
+
+    # NOTE(review): `shuffle` is called without an RNG argument here, so the positions of
+    # the zeros are not driven by `rng`; the permutation runs under `@allowscalar`.
+    return CUDA.@allowscalar mapslices(shuffle, sparse_array, dims=1)
+end
+
+# GPU method of `identity_init`. `rng` is accepted for dispatch only; the output is
+# deterministic.
+#
+#   - 1 dimension: a zero vector.
+#   - 2 dimensions: zeros with `gain` on the main diagonal, circularly shifted by `shift`.
+#   - more dimensions: zeros with `gain` at `(centers..., i, i)` for
+#     `i in 1:min(dims[end - 1], dims[end])`, shifted by `shift` along the last two
+#     dimensions.
+function identity_init(rng::AbstractCuRNG, ::Type{T}, dims::Integer...;
+        gain::Number=1, shift::Integer=0) where {T <: Number}
+    if length(dims) == 1
+        # Bias initialization
+        return CUDA.zeros(T, dims...)
+    elseif length(dims) == 2
+        # Matrix multiplication
+        rows, cols = dims
+        mat = CUDA.zeros(T, rows, cols)
+        # Only the diagonal must be set. `view(mat, 1:n, 1:n)` would select the whole
+        # n×n block, so address the diagonal through its column-major linear indices
+        # instead: element (i, i) lives at 1 + (i - 1) * (rows + 1).
+        diag_indices = range(1; step=rows + 1, length=min(rows, cols))
+        CUDA.fill!(view(mat, diag_indices), gain)
+        return CUDA.circshift(mat, shift)
+    else
+        # Convolution or more dimensions
+        nin, nout = dims[end - 1], dims[end]
+        centers = map(d -> cld(d, 2), dims[1:(end - 2)])
+        weights = CUDA.zeros(T, dims...)
+        # Scalar writes, one per diagonal entry; a vectorised formulation would be
+        # preferable.
+        CUDA.@allowscalar for i in 1:min(nin, nout)
+            index = (centers..., i, i)
+            weights[index...] = gain
+        end
+        return CUDA.circshift(weights, (ntuple(d -> 0, length(dims) - 2)..., shift, shift))
+    end
+end
+
+# Convenience methods for CUDA RNGs, generated for every initializer defined in this
+# extension.
+for init in (:sparse_init, :identity_init)
+    @eval begin
+        # Element type omitted: default to `Float32`.
+        function ($init)(rng::AbstractCuRNG, dims::Integer...; kwargs...)
+            return $init(rng, Float32, dims...; kwargs...)
+        end
+
+        # No dimensions given: defer to `__partial_apply` with the RNG and keywords.
+        function ($init)(rng::AbstractCuRNG; kwargs...)
+            return __partial_apply($init, (rng, (; kwargs...)))
+        end
+
+        # No dimensions given: defer to `__partial_apply` with RNG, type and keywords.
+        function ($init)(rng::AbstractCuRNG, ::Type{T}; kwargs...) where {T <: Number}
+            return __partial_apply($init, ((rng, T), (; kwargs...)))
+        end
+    end
+end
+
 end
diff --git a/src/WeightInitializers.jl b/src/WeightInitializers.jl
@@ -3,7 +3,8 @@ module WeightInitializers
 import PrecompileTools: @recompile_invalidations
 
 @recompile_invalidations begin
-    using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics
+    using ChainRulesCore, PartialFunctions, Random, SpecialFunctions, Statistics,
+          LinearAlgebra
 end
 
 include("utils.jl")
@@ -14,7 +15,8 @@ for f in [
     :zeros64, :ones64, :rand64, :randn64, :zeros32, :ones32, :rand32, :randn32, :zeros16,
     :ones16, :rand16, :randn16, :zerosC64, :onesC64, :randC64, :randnC64, :zerosC32,
     :onesC32, :randC32, :randnC32, :zerosC16, :onesC16, :randC16, :randnC16, :glorot_normal,
-    :glorot_uniform, :kaiming_normal, :kaiming_uniform, :truncated_normal]
+    :glorot_uniform, :kaiming_normal, :kaiming_uniform, :truncated_normal, :orthogonal,
+    :sparse_init, :identity_init]
     @eval @non_differentiable $(f)(::Any...)
 end
 
@@ -25,5 +27,8 @@ export zerosC64, onesC64, randC64, randnC64, zerosC32, onesC32, randC32, randnC3
 export glorot_normal, glorot_uniform
 export kaiming_normal, kaiming_uniform
 export truncated_normal
+export orthogonal
+export sparse_init
+export identity_init
 
 end
diff --git a/src/initializers.jl b/src/initializers.jl
@@ -122,9 +122,216 @@ function truncated_normal(rng::AbstractRNG, ::Type{T}, dims::Integer...; mean=T(
     return xs
 end
 
+"""
+    orthogonal([::AbstractRNG=_default_rng()], [T=Float32], dims::Integer...;
+        gain = 1)  -> AbstractArray{T, length(dims)}
+
+Return an `AbstractArray{T}` of the given dimensions (`dims`) which is a
+(semi) orthogonal matrix, as described in [^Saxe14].
+
+The function constructs an orthogonal or semi-orthogonal matrix depending on the specified
+dimensions. For two dimensions, it returns a matrix where `dims = (rows, cols)`.
+For more than two dimensions, it computes an orthogonal matrix of
+size `prod(dims[1:(end - 1)])` by `dims[end]` before reshaping it to
+the original dimensions. The reshape is applied regardless of whether the flattened
+matrix is tall or wide.
+
+Cannot construct a vector, i.e., `length(dims) == 1` is forbidden.
+
+# Arguments
+
+  - `rng::AbstractRNG`: Random number generator.
+  - `T::Type{<:Number}`: The type of the elements in the array.
+  - `dims::Integer...`: The dimensions of the array.
+  - `gain::Number`: Scaling factor for the elements of the orthogonal matrix.
+
+# References
+
+[^Saxe14] Saxe, McClelland, Ganguli. "Exact solutions to the nonlinear dynamics of
+learning in deep linear neural networks",
+ICLR 2014, https://arxiv.org/abs/1312.6120
+"""
+function orthogonal(rng::AbstractRNG, ::Type{T}, dims::Integer...;
+        gain::Number=T(1.0)) where {T <: Number}
+    @assert length(dims)>1 "Creating vectors (length(dims) == 1) is not allowed"
+
+    # Flatten all leading dimensions into the row count; for two dimensions this is
+    # simply `dims[1]`.
+    rows = prod(dims[1:(end - 1)])
+    cols = dims[end]
+
+    if rows < cols
+        # Wide case: build the tall (cols × rows) matrix and transpose it. The result
+        # still has to go through the reshape below when `length(dims) > 2`.
+        mat = permutedims(orthogonal(rng, T, cols, rows; gain))
+    else
+        mat = randn(rng, T, rows, cols)
+        Q, R = qr(mat)
+        # Multiply by the signs of R's diagonal to fix the sign ambiguity of the
+        # QR factorisation, then apply the gain.
+        mat .= Q * sign.(Diagonal(R)) .* T(gain)
+    end
+
+    return length(dims) > 2 ? reshape(mat, dims) : mat
+end
+
+"""
+    sparse_init([::AbstractRNG=_default_rng()], [T=Float32], dims::Integer...;
+        sparsity::Number, std::Number=0.01) -> AbstractArray{T}
+
+Creates a sparsely initialized weight matrix with a specified proportion of zeroed elements,
+using random numbers drawn from a normal distribution for the non-zero elements.
+This method is introduced in [^Martens2010].
+
+Note: The sparsity parameter controls the proportion of each column that will be zeroed:
+exactly `ceil(sparsity * rows)` entries per column are set to zero, at positions chosen by
+shuffling each column with `rng`. For example, with 10 rows a sparsity of 0.3 zeroes
+3 entries in every column. The non-zero elements are distributed according to a
+normal distribution, scaled by the `std` parameter.
+
+# Arguments
+
+  - `rng::AbstractRNG`: The random number generator to use, both for the normal samples
+    and for the placement of the zeros.
+  - `T::Type{<:Number}`: The numeric type of the elements in the returned array.
+  - `dims::Integer...`: The dimensions of the weight matrix to be generated.
+    Exactly two dimensions are required.
+  - `sparsity::Number`: The proportion of elements to be zeroed. Values above 1 are
+    treated as 1.
+  - `std::Number=0.01`: The standard deviation of the normal distribution. It is
+    converted to `T` before scaling.
+
+# Returns
+
+  - `AbstractArray{T}`: A sparsely initialized weight matrix of dimensions `dims`
+    and type `T`.
+
+# Throws
+
+  - `ArgumentError`: If `length(dims) != 2`.
+
+# Examples
+
+```julia
+using Random
+
+# Initialize a 5x5 sparsely initialized matrix with 30% sparsity
+rng = MersenneTwister(123)
+matrix = sparse_init(rng, Float32, 5, 5; sparsity=0.3, std=0.01)
+```
+
+Illustrative output (the exact values depend on the RNG stream):
+
+```
+5×5 Matrix{Float32}:
+  0.0          0.00273815    0.00592403   0.0          0.0
+  0.00459416  -0.000754831  -0.00888936  -0.0077507    0.0
+  0.0         -0.00194229    0.0          0.0         -0.00468489
+  0.0114265    0.0           0.0         -0.00734886   0.00277726
+ -0.00396679   0.0           0.00327215  -0.0071741   -0.00880897
+```
+
+# References
+
+[^Martens2010] Martens, J, "Deep learning via Hessian-free optimization"
+_Proceedings of the 27th International Conference on International Conference
+on Machine Learning_. 2010.
+"""
+function sparse_init(rng::AbstractRNG, ::Type{T}, dims::Integer...;
+        sparsity::Number, std::Number=T(0.01)) where {T <: Number}
+    if length(dims) != 2
+        throw(ArgumentError("Only 2-dimensional outputs are supported for sparse initialization."))
+    end
+
+    rows = first(dims)
+    prop_zero = min(1.0, sparsity)
+    num_zeros = ceil(Integer, prop_zero * rows)
+    # Convert `std` to `T` so that e.g. `std=0.01` (a `Float64`) does not promote a
+    # `Float32` request to a `Float64` result.
+    sparse_array = randn(rng, T, dims...) .* T(std)
+    sparse_array[1:num_zeros, :] .= zero(T)
+    # Shuffle every column with the caller's RNG so that the zero pattern is
+    # reproducible, not just the sampled values.
+    return mapslices(col -> shuffle(rng, col), sparse_array; dims=1)
+end
+
+"""
+    identity_init([::AbstractRNG=_default_rng()], [T=Float32], size...; gain::Number=1,
+        shift::Integer=0) -> AbstractArray{T}
+
+Constructs an array that aims to provide an identity mapping when used as parameters in
+most layers of a neural network. The identity mapping is scaled by the `gain` parameter.
+
+# Behavior
+
+  - 1D: Returns a `Vector` of zeros (useful for biases in layers where
+    `input_size == output_size`).
+  - 2D: Returns an identity matrix
+    (useful for fully connected layers with equal input and output sizes).
+  - More than 2D: Returns a tensor where the central slice along the last
+    two dimensions is an identity matrix, and the rest are zeros
+    (useful for convolutional layers, simulating an identity convolution).
+
+# Caveats
+
+  - Not all layers will result in an identity mapping when using this initializer.
+    Exceptions include recurrent and normalization layers.
+  - Layers must have `input_size == output_size` for a perfect identity mapping.
+    In cases where this condition is not met, the function pads extra dimensions with zeros.
+  - For convolutional layers to achieve an identity mapping, kernel sizes must be odd,
+    and appropriate padding must be applied to ensure the output
+    feature maps are the same size as the input feature maps.
+
+# Arguments
+
+  - `rng::AbstractRNG`: An optional random number generator,
+    included for consistency with other initializers but ignored since the
+    output is deterministic.
+  - `T::Type{<:Number}`: The numeric type of the array elements.
+  - `size...`: The dimensions of the array to be initialized.
+  - `gain::Number=1`: A scaling factor applied to the identity mapping.
+  - `shift::Integer=0`: The circular shift applied to the output array. For a matrix
+    it is passed directly to `circshift`; for more than two dimensions it is applied
+    to each of the last two dimensions. It has no effect on 1D outputs.
+
+# Returns
+
+  - `AbstractArray{T}`: An array initialized to represent an identity mapping,
+    scaled by `gain` and optionally shifted by `shift`.
+
+# Examples
+
+```julia
+using Random
+
+# Identity matrix for fully connected layer
+identity_matrix = identity_init(MersenneTwister(123), Float32, 5, 5)
+
+# Identity tensor for convolutional layer
+identity_tensor = identity_init(MersenneTwister(123), Float32, 3, 3, 5, 5;
+    gain=1.5, shift=1)
+```
+"""
+function identity_init(rng::AbstractRNG, ::Type{T}, dims::Integer...;
+        gain::Number=1, shift::Integer=0) where {T <: Number}
+    if length(dims) == 1
+        # Bias initialization
+        return zeros(T, dims...)
+    elseif length(dims) == 2
+        # Matrix multiplication
+        rows, cols = dims
+        mat = zeros(T, rows, cols)
+        for i in 1:min(rows, cols)
+            mat[i, i] = gain
+        end
+        return circshift(mat, shift)
+    else
+        # Convolution or more dimensions
+        nin, nout = dims[end - 1], dims[end]
+        # Central index of every leading (spatial) dimension.
+        centers = map(d -> cld(d, 2), dims[1:(end - 2)])
+        weights = zeros(T, dims...)
+        for i in 1:min(nin, nout)
+            weights[centers..., i, i] = gain
+        end
+        # Leave the leading dimensions in place; shift only the last two.
+        return circshift(weights, (ntuple(d -> 0, length(dims) - 2)..., shift, shift))
+    end
+end
+
 # Default Fallbacks for all functions
 for initializer in (:glorot_uniform, :glorot_normal, :kaiming_uniform, :kaiming_normal,
-    :truncated_normal)
+    :truncated_normal, :orthogonal, :sparse_init, :identity_init)
     NType = ifelse(initializer === :truncated_normal, Real, Number)
     @eval function ($initializer)(dims::Integer...; kwargs...)
         return $initializer(_default_rng(), Float32, dims...; kwargs...)