fix: LV/Octavian moved to optional deps #986

Merged (6 commits, Oct 18, 2024)
6 changes: 2 additions & 4 deletions Project.toml
@@ -33,7 +33,6 @@ Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
WeightInitializers = "d49dbf32-c5c2-4618-8acc-27bb2598ef2d"

[weakdeps]
@@ -88,8 +87,8 @@ GPUArraysCore = "0.1.6, 0.2"
LinearAlgebra = "1.10"
LossFunctions = "0.11.1"
LuxCore = "1"
LuxLib = "1.3"
MLDataDevices = "1.2"
LuxLib = "1.3.4"
MLDataDevices = "1.3"
MLUtils = "0.4.4"
MPI = "0.20.19"
MacroTools = "0.5.13"
@@ -109,7 +108,6 @@ Static = "1.1.1"
StaticArraysCore = "1.4.3"
Statistics = "1.10"
Tracker = "0.2.34"
VectorizationBase = "0.21.70"
WeightInitializers = "1"
Zygote = "0.6.70"
julia = "1.10"
4 changes: 2 additions & 2 deletions docs/Project.toml
@@ -43,9 +43,9 @@ Literate = "2.18.0"
Lux = "1"
LuxCUDA = "0.3.2"
LuxCore = "1"
LuxLib = "1"
LuxLib = "1.3.4"
LuxTestUtils = "1.2"
MLDataDevices = "1.2"
MLDataDevices = "1.3"
Optimisers = "0.3.3"
Pkg = "1.10"
Printf = "1.10"
12 changes: 12 additions & 0 deletions docs/src/manual/performance_pitfalls.md
@@ -83,6 +83,18 @@ Prefer to use deep learning primitives and their fused variants from `LuxLib.jl`
5. Replace uses of `σ.(x .+ b)` with [`LuxLib.bias_activation`](@ref) or
   [`LuxLib.bias_activation!!`](@ref) (the latter is often faster); see the sketch below.
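
A minimal sketch of that substitution, assuming `x` is a feature × batch matrix and `b` a
per-feature bias vector (the layout `bias_activation` is documented to accept):

```julia
using LuxLib

x = randn(Float32, 4, 8)                              # 4 features × 8 samples
b = randn(Float32, 4)                                 # one bias entry per feature

y_naive = tanh.(x .+ b)                               # unfused broadcast
y_fused = LuxLib.bias_activation(tanh, x, b)          # fused equivalent
y_fast  = LuxLib.bias_activation!!(tanh, copy(x), b)  # may reuse the copied buffer
```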

## Optional Dependencies for Performance

For faster performance on CPUs, load the following packages:

1. `LoopVectorization.jl`
2. `Octavian.jl`

If these are available, we automatically use optimized versions of the layers, though there
are cases where this can cause issues (see
[#980](https://github.com/LuxDL/Lux.jl/issues/980) and
[disabling loop vectorization](@ref disable_loop_vectorization)).
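
As a minimal sketch of opting in on the CPU (the model and data below are purely
illustrative):

```julia
using Lux, LoopVectorization, Octavian, Random

rng = Random.default_rng()
model = Dense(32 => 32, relu)
ps, st = Lux.setup(rng, model)
x = randn(rng, Float32, 32, 16)

# With LoopVectorization/Octavian loaded, LuxLib can dispatch to its accelerated CPU kernels.
y, _ = model(x, ps, st)
```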

## Data Loading and Device Transfer

A common pattern for loading data and transferring data to GPUs looks like this:
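
A hypothetical sketch of such a pattern (dataset, sizes, and names invented purely for
illustration) might look like:

```julia
using Lux, MLUtils

# Hypothetical in-memory dataset: 32 features per sample, 1000 samples.
X, y = randn(Float32, 32, 1000), randn(Float32, 1, 1000)

gdev = gpu_device()                     # falls back to the CPU device if no GPU is available
dataloader = DataLoader((X, y); batchsize=64, shuffle=true)

for (x_batch, y_batch) in dataloader
    # Move each mini-batch to the device inside the loop rather than the whole dataset.
    x_batch, y_batch = gdev(x_batch), gdev(y_batch)
    # ... forward/backward pass ...
end
```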
11 changes: 11 additions & 0 deletions docs/src/manual/preferences.md
@@ -13,6 +13,8 @@

```julia
pkg> preference add Lux <preference-name>=<value>
pkg> preference add LuxLib <preference-name>=<value>
pkg> preference add LuxCore <preference-name>=<value>
```

Lux.jl relies on several preferences to make decisions on how to run your code. Here is an
@@ -57,3 +59,12 @@ By default, both of these preferences are set to `false`.
- Setting the `LuxLib` preference sets the check at the level of the functional layer of
  Lux, for example, [`fused_dense_bias_activation`](@ref). These functions are supposed to
  be type stable for common input types and can be used to guarantee type stability.

## [Disabling Loop Vectorization / Octavian](@id disable_loop_vectorization)

`LoopVectorization.jl` and `Octavian.jl` are optional dependencies that are used to
accelerate certain CPU operations. However, these packages are tightly coupled with Julia
and might not work on all Julia versions and systems. If these packages are loaded in any
form, LuxLib will use the optimized versions of the functions. In some cases it might be
desirable to disable them and use the default implementations instead. This can be done by
setting the `disable_loop_vectorization` preference to `true` for `LuxLib`.
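
For example, using the same `Pkg` preference command shown at the top of this page:

```julia
pkg> preference add LuxLib disable_loop_vectorization=true
```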
13 changes: 8 additions & 5 deletions examples/Basics/main.jl
@@ -95,7 +95,8 @@ W = randn(5, 10)
x = rand(10)
W * x

# Julia's arrays are very powerful, and you can learn more about what they can do [here](https://docs.julialang.org/en/v1/manual/arrays/).
# Julia's arrays are very powerful, and you can learn more about what they can do
# [here](https://docs.julialang.org/en/v1/manual/arrays/).

# ### CUDA Arrays

@@ -206,18 +207,20 @@ println("Computed Gradient via Forward Mode AD (ForwardDiff): ", ForwardDiff.gra
# ### Jacobian-Vector Product

# I will defer the discussion on forward-mode AD to
# [https://book.sciml.ai/notes/08-Forward-Mode_Automatic_Differentiation_(AD)_via_High_Dimensional_Algebras/](https://book.sciml.ai/notes/08-Forward-Mode_Automatic_Differentiation_(AD)_via_High_Dimensional_Algebras/). Here let us just look
# at a mini example on how to use it.
# [https://book.sciml.ai/notes/08-Forward-Mode_Automatic_Differentiation_(AD)_via_High_Dimensional_Algebras/](https://book.sciml.ai/notes/08-Forward-Mode_Automatic_Differentiation_(AD)_via_High_Dimensional_Algebras/).
# Here let us just look at a mini example on how to use it.

f(x) = x .* x ./ 2
x = randn(rng, Float32, 5)
v = ones(Float32, 5)

# Construct the pushforward function. We will write out the function here but in
# practice we recommend using [SparseDiffTools.auto_jacvec](https://docs.sciml.ai/SparseDiffTools/stable/#Jacobian-Vector-and-Hessian-Vector-Products)!
# practice we recommend using
# [SparseDiffTools.auto_jacvec](https://docs.sciml.ai/SparseDiffTools/stable/#Jacobian-Vector-and-Hessian-Vector-Products)!
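
# For reference, with that package loaded the same product can be computed in a single call
# (this assumes the `auto_jacvec(f, x, v)` signature from the documentation linked above and
# is not used in the rest of this tutorial):

## jvp_reference = SparseDiffTools.auto_jacvec(f, x, v)  # ≈ J_f(x) * v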

# First we need to create a Tag for ForwardDiff. It is enough to know that this is something
# that you must do. For more details, see the [ForwardDiff Documentation](https://juliadiff.org/ForwardDiff.jl/dev/user/advanced/#Custom-tags-and-tag-checking)!
# that you must do. For more details, see the
# [ForwardDiff Documentation](https://juliadiff.org/ForwardDiff.jl/dev/user/advanced/#Custom-tags-and-tag-checking)!
struct TestTag end

# Going into the details of what this function is doing is beyond the scope of this tutorial.
4 changes: 3 additions & 1 deletion examples/BayesianNN/main.jl
@@ -20,7 +20,9 @@ Turing.setprogress!(true);

# ## Generating data

# Our goal here is to use a Bayesian neural network to classify points in an artificial dataset. The code below generates data points arranged in a box-like pattern and displays a graph of the dataset we'll be working with.
# Our goal here is to use a Bayesian neural network to classify points in an artificial
# dataset. The code below generates data points arranged in a box-like pattern and displays
# a graph of the dataset we'll be working with.

## Number of points to generate
N = 80
14 changes: 7 additions & 7 deletions examples/HyperNet/main.jl
@@ -81,10 +81,10 @@ function train()
rng = Xoshiro(0)
ps, st = Lux.setup(rng, model) |> dev

train_state = Training.TrainState(model, ps, st, Adam(3.0f-4))
train_state = Training.TrainState(model, ps, st, Adam(0.001f0))

### Let's train the model
nepochs = 25
nepochs = 50
for epoch in 1:nepochs, data_idx in 1:2
train_dataloader, test_dataloader = dataloaders[data_idx] .|> dev

@@ -106,8 +106,8 @@

data_name = data_idx == 1 ? "MNIST" : "FashionMNIST"

@printf "[%3d/%3d] \t %12s \t Time %.5fs \t Training Accuracy: %.2f%% \t Test \
Accuracy: %.2f%%\n" epoch nepochs data_name ttime train_acc test_acc
@printf "[%3d/%3d]\t%12s\tTime %3.5fs\tTraining Accuracy: %3.2f%%\tTest \
Accuracy: %3.2f%%\n" epoch nepochs data_name ttime train_acc test_acc
end

println()
@@ -126,13 +126,13 @@

data_name = data_idx == 1 ? "MNIST" : "FashionMNIST"

@printf "[FINAL] \t %12s \t Training Accuracy: %.2f%% \t Test Accuracy: \
%.2f%%\n" data_name train_acc test_acc
@printf "[FINAL]\t%12s\tTraining Accuracy: %3.2f%%\tTest Accuracy: \
%3.2f%%\n" data_name train_acc test_acc
test_acc_list[data_idx] = test_acc
end
return test_acc_list
end

test_acc_list = train()
@assert test_acc_list[1] > 0.90 && test_acc_list[2] > 0.70 #hide
@assert test_acc_list[1] > 60 && test_acc_list[2] > 60 #hide
nothing #hide
2 changes: 1 addition & 1 deletion examples/ImageNet/Project.toml
@@ -33,7 +33,7 @@ ImageMagick = "1"
JLD2 = "0.5.1"
Lux = "1"
LuxCUDA = "0.3.3"
MLDataDevices = "1"
MLDataDevices = "1.3"
MLUtils = "0.4.4"
MPI = "0.20.21"
NCCL = "0.1.1"
14 changes: 7 additions & 7 deletions examples/NeuralODE/Project.toml
@@ -7,7 +7,7 @@ MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
OrdinaryDiffEqTsit5 = "b1df2697-797e-41e3-8120-5422d3b24e4a"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SciMLSensitivity = "1ed8b502-d754-442c-8d5d-10ac956f44a1"
@@ -17,12 +17,12 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
[compat]
ComponentArrays = "0.15"
Lux = "1"
LuxCUDA = "0.2, 0.3"
MLDatasets = "0.5, 0.7"
MLUtils = "0.2, 0.3, 0.4"
OneHotArrays = "0.1, 0.2"
Optimisers = "0.2, 0.3"
OrdinaryDiffEq = "6"
LuxCUDA = "0.3"
MLDatasets = "0.7"
MLUtils = "0.4"
OneHotArrays = "0.2"
Optimisers = "0.3"
OrdinaryDiffEqTsit5 = "1"
SciMLSensitivity = "7.63"
Statistics = "1"
Zygote = "0.6"
10 changes: 5 additions & 5 deletions examples/NeuralODE/main.jl
@@ -7,8 +7,8 @@

# ## Package Imports

using Lux, ComponentArrays, SciMLSensitivity, LuxCUDA, Optimisers, OrdinaryDiffEq, Random,
Statistics, Zygote, OneHotArrays, InteractiveUtils, Printf
using Lux, ComponentArrays, SciMLSensitivity, LuxCUDA, Optimisers, OrdinaryDiffEqTsit5,
Random, Statistics, Zygote, OneHotArrays, InteractiveUtils, Printf
using MLDatasets: MNIST
using MLUtils: DataLoader, splitobs

@@ -139,9 +139,9 @@ function train(model_function; cpu::Bool=false, kwargs...)
end
ttime = time() - stime

tr_acc = accuracy(model, tstate.parameters, tstate.states, train_dataloader)
te_acc = accuracy(model, tstate.parameters, tstate.states, test_dataloader)
@printf "[%d/%d] \t Time %.2fs \t Training Accuracy: %.5f%% \t Test \
tr_acc = accuracy(model, tstate.parameters, tstate.states, train_dataloader) * 100
te_acc = accuracy(model, tstate.parameters, tstate.states, test_dataloader) * 100
@printf "[%d/%d]\tTime %.4fs\tTraining Accuracy: %.5f%%\tTest \
Accuracy: %.5f%%\n" epoch nepochs ttime tr_acc te_acc
end
end
12 changes: 0 additions & 12 deletions src/helpers/size_propagator.jl
@@ -9,7 +9,6 @@ using Static: Static, StaticBool
# We need these to avoid ambiguities
using SIMDTypes: SIMDTypes
using StaticArraysCore: StaticArraysCore
using VectorizationBase: VectorizationBase

const VecT = Union{Bool, Float16, Float32, Float64, Int16, Int32, Int64,
Int8, UInt16, UInt32, UInt64, UInt8, SIMDTypes.Bit}
@@ -43,17 +42,6 @@ function Base.convert(::Type{ForwardDiff.Dual{T, V, Tag}}, ::Nil) where {T, V, T
throw(ArgumentError(NIL_DUAL_ERROR_MSG))
end

const NIL_VEC_ERROR_MSG = "`Nil` is incompatible with `VectorizationBase` numbers."

VectorizationBase.Vec{W, T}(::Nil) where {T, W} = throw(ArgumentError(NIL_VEC_ERROR_MSG))
function VectorizationBase.VecUnroll{
N, W, T, V}(::Nil) where {T, W, V <: VectorizationBase.AbstractSIMDVector{W, T}, N}
throw(ArgumentError(NIL_VEC_ERROR_MSG))
end
function VectorizationBase.VecUnroll{N, 1, T, T}(::Nil) where {T <: VecT, N}
throw(ArgumentError(NIL_VEC_ERROR_MSG))
end

const NIL_STATIC_ERROR_MSG = "`Nil` is incompatible with `Static` numbers."

function Base.convert(::Type{Nil},
8 changes: 6 additions & 2 deletions test/Project.toml
@@ -14,12 +14,14 @@ Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@@ -55,12 +57,14 @@ Hwloc = "3.2.0"
InteractiveUtils = "<0.0.1, 1"
LinearAlgebra = "1.10"
Logging = "1.10"
LoopVectorization = "0.12.171"
LuxCore = "1.0"
LuxLib = "1.3"
LuxLib = "1.3.4"
LuxTestUtils = "1.3"
MLDataDevices = "1.1"
MLDataDevices = "1.3"
MLUtils = "0.4.3"
NNlib = "0.9.24"
Octavian = "0.3.28"
OneHotArrays = "0.2.5"
Optimisers = "0.3.3"
Pkg = "1.10"
2 changes: 1 addition & 1 deletion test/qa_tests.jl
@@ -10,7 +10,7 @@
Aqua.test_piracies(Lux; treat_as_own=[Lux.outputsize])
end

@testitem "Explicit Imports: Quality Assurance" setup=[SharedTestSetup] tags=[:others] begin
@testitem "Explicit Imports: Quality Assurance" tags=[:others] begin
# Load all trigger packages
import Lux, ComponentArrays, ReverseDiff, SimpleChains, Tracker, Zygote, Enzyme
using ExplicitImports
2 changes: 1 addition & 1 deletion test/reactant/loss_tests.jl
@@ -1,4 +1,4 @@
@testitem "Compiled Loss Functions" tags=[:reactant] setup=[SharedTestSetup] begin
@testitem "Compiled Loss Functions" tags=[:reactant] setup=[SharedTestSetup] skip=:(Sys.iswindows()) begin
using Reactant, Lux, OneHotArrays

rng = StableRNG(123)
2 changes: 1 addition & 1 deletion test/reactant/training_tests.jl
@@ -1,4 +1,4 @@
@testitem "Reactant: Training API" tags=[:reactant] setup=[SharedTestSetup] begin
@testitem "Reactant: Training API" tags=[:reactant] setup=[SharedTestSetup] skip=:(Sys.iswindows()) begin
using Reactant, Optimisers

@testset "$(mode)" for (mode, atype, dev, ongpu) in MODES
2 changes: 2 additions & 0 deletions test/shared_testsetup.jl
@@ -13,6 +13,8 @@ using MLDataDevices: default_device_rng, CPUDevice, CUDADevice, AMDGPUDevice
using LuxTestUtils: check_approx
using Static: True

using Octavian, LoopVectorization

LuxTestUtils.jet_target_modules!(["Lux", "LuxCore", "LuxLib"])
LinearAlgebra.BLAS.set_num_threads(Threads.nthreads())
