diff --git a/Android.bp b/Android.bp
index d6516fec72..159aebb516 100644
--- a/Android.bp
+++ b/Android.bp
@@ -1025,11 +1025,14 @@ cc_library_static {
         "src/runtime/experimental/operators/CpuActivation.cpp",
         "src/runtime/experimental/operators/CpuAdd.cpp",
         "src/runtime/experimental/operators/CpuDepthwiseConv2d.cpp",
+        "src/runtime/experimental/operators/CpuDequantize.cpp",
         "src/runtime/experimental/operators/CpuElementwise.cpp",
+        "src/runtime/experimental/operators/CpuGEMMLowp.cpp",
         "src/runtime/experimental/operators/CpuGemm.cpp",
         "src/runtime/experimental/operators/CpuGemmConv2d.cpp",
         "src/runtime/experimental/operators/CpuGemmDirectConv2d.cpp",
         "src/runtime/experimental/operators/CpuMul.cpp",
+        "src/runtime/experimental/operators/CpuQuantize.cpp",
         "src/runtime/experimental/operators/CpuSoftmax.cpp",
         "src/runtime/experimental/operators/CpuSub.cpp",
         "src/runtime/experimental/operators/CpuTranspose.cpp",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 321a83bfbb..5a31e61a76 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute)
 project(
   ArmCompute
-  VERSION 42.0.0
+  VERSION 43.0.0
   DESCRIPTION
     "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures"
   LANGUAGES C CXX ASM)
diff --git a/LICENSES/Apache-2.0.txt b/LICENSES/Apache-2.0.txt
new file mode 100644
index 0000000000..e45f145de6
--- /dev/null
+++ b/LICENSES/Apache-2.0.txt
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: 2008-2023 The Khronos Group Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/LICENSE b/LICENSES/MIT.txt
similarity index 82%
rename from LICENSE
rename to LICENSES/MIT.txt
index 781685ab31..ed43132fe0 100644
--- a/LICENSE
+++ b/LICENSES/MIT.txt
@@ -1,6 +1,11 @@
-MIT License
+# SPDX-FileCopyrightText: 2012-2017 Christian Rau
+# SPDX-FileCopyrightText: 2017 Leon Merten Lohse
+# SPDX-FileCopyrightText: 2017 Sean Barrett
+# SPDX-FileCopyrightText: 2017-2024 Arm Limited
+#
+# SPDX-License-Identifier: MIT
 
-Copyright (c) 2017-2024 Arm Limited
+MIT License
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 97ffe318c4..a5387961b4 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
  <img src="https://raw.githubusercontent.com/ARM-software/ComputeLibrary/gh-pages/ACL_logo.png"/><br><br>
 </div>
 
-# Compute Library ![](https://img.shields.io/badge/latest_release-24.09-green)
+# Compute Library ![](https://img.shields.io/badge/latest_release-24.11-green)
 
 
 The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.<br>
@@ -37,7 +37,7 @@ Key Features:
 <br>
 
 ## Documentation
-[![Documentation](https://img.shields.io/badge/documentation-24.09-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/index.xhtml)
+[![Documentation](https://img.shields.io/badge/documentation-24.11-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/index.xhtml)
 
 > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc.
 
@@ -50,22 +50,22 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C
 
 | Platform       | Operating System | Release archive (Download) |
 | -------------- | ---------------- | -------------------------- |
-| Raspberry Pi 4 | Linux® 32bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-bin.tar.gz) |
-| Raspberry Pi 4 | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) |
-| Odroid N2      | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) |
-| HiKey960       | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) |
+| Raspberry Pi 4 | Linux® 32bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-bin.tar.gz) |
+| Raspberry Pi 4 | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) |
+| Odroid N2      | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) |
+| HiKey960       | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) |
 
 <br>
 
 | Architecture | Operating System | Release archive (Download) |
 | ------------ | ---------------- | -------------------------- |
-| armv7        | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-gpu-bin.tar.gz) |
-| arm64-v8a    | Android™          | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-android-aarch64-cpu-gpu-bin.tar.gz) |
-| arm64-v8a    | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) |
+| armv7        | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-armv7a-cpu-gpu-bin.tar.gz) |
+| arm64-v8a    | Android™          | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-android-aarch64-cpu-gpu-bin.tar.gz) |
+| arm64-v8a    | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.11/arm_compute-v24.11-linux-aarch64-cpu-gpu-bin.tar.gz) |
 
 <br>
 
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.09-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.09)
+Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.11-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.11)
 
 Pre-build binaries are generated with the following security / good coding practices related flags:
 > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong
@@ -107,13 +107,13 @@ Pre-build binaries are generated with the following security / good coding pract
 
 ## Experimental builds
 
-**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/how_to_build.xhtml) for more details.
+**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/how_to_build.xhtml) for more details.
 
 <br>
 
 ## How to contribute
 
-Contributions to the Compute Library are more than welcome. If you are interested on contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/contribution_guidelines.xhtml).
+Contributions to the Compute Library are more than welcome. If you are interested in contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.11/contribution_guidelines.xhtml).
 
 ### Developer Certificate of Origin (DCO)
 Before the Compute Library accepts your contribution, you need to certify its origin and give us your permission. To manage this process we use the Developer Certificate of Origin (DCO) V1.1 (https://developercertificate.org/)
diff --git a/SConscript b/SConscript
index 2aff67d8ca..784db8edcb 100644
--- a/SConscript
+++ b/SConscript
@@ -33,8 +33,8 @@ import codecs
 import platform
 import SCons
 
-VERSION = "v24.09"
-LIBRARY_VERSION_MAJOR = 42
+VERSION = "v24.11"
+LIBRARY_VERSION_MAJOR = 43
 LIBRARY_VERSION_MINOR = 0
 LIBRARY_VERSION_PATCH = 0
 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH)
@@ -627,12 +627,8 @@ custom_operators = []
 custom_types = []
 custom_layouts = []
 
-use_custom_ops = env['high_priority'] or env['build_config']
+use_custom_ops = env['build_config']
 
-if env['high_priority']:
-    custom_operators = filelist['high_priority']
-    custom_types = ['all']
-    custom_layouts = ['all']
 
 if env['build_config']:
     custom_operators, custom_types, custom_layouts = read_build_config_json(env['build_config'])
diff --git a/SConstruct b/SConstruct
index c4bfef826d..8d7bd291e8 100644
--- a/SConstruct
+++ b/SConstruct
@@ -116,7 +116,6 @@ vars.AddVariables(
     PathVariable("build_dir", "Specify sub-folder for the build", ".", PathVariable.PathAccept),
     PathVariable("install_dir", "Specify sub-folder for the install", "", PathVariable.PathAccept),
     BoolVariable("exceptions", "Enable/disable C++ exception support", True),
-    BoolVariable("high_priority", "Generate a library containing only the high priority operators", False),
     PathVariable("linker_script", "Use an external linker script", "", PathVariable.PathAccept),
     PathVariable("external_tests_dir", """Add examples, benchmarks and tests to the tests suite from an external path. In order to use this option, the external tests directory must have the following structure:
     EXTERNAL_TESTS_DIR:
@@ -519,21 +518,11 @@ if not GetOption("help"):
             # Thus for backward compatibility, we include this flag only for NDK < r23
             env.Append(CXXFLAGS = ['-no-integrated-as'])
 
-if env['high_priority'] and env['build_config']:
-    print("The high priority library cannot be built in conjunction with a user-specified build configuration")
-    Exit(1)
-
-if not env['high_priority'] and not env['build_config']:
-    env.Append(CPPDEFINES = ['ARM_COMPUTE_GRAPH_ENABLED'])
-
 data_types = []
 data_layouts = []
 
 # Set correct data types / layouts to build
-if env['high_priority']:
-    data_types = ['all']
-    data_layouts = ['all']
-elif env['build_config']:
+if env['build_config']:
     data_types, data_layouts = read_build_config_json(env['build_config'])
 else:
     data_types = env['data_type_support']
@@ -613,7 +602,9 @@ else:
         env.Append(CXXFLAGS = ['-O3'])
     else:
         # on windows we use clang-cl which does not support the option -O3
-        env.Append(CXXFLAGS = ['-O2'])
+        if not version_at_least(compiler_ver, '17.0.0'):
+            # Only append -O2 for clang older than 17: clang 17 or later crashes with -O2, so optimizations stay disabled there
+            env.Append(CXXFLAGS = ['-O2'])
 
 if env['asserts']:
     env.Append(CPPDEFINES = ['ARM_COMPUTE_ASSERTS_ENABLED'])
@@ -653,7 +644,7 @@ Export('version_at_least')
 
 SConscript('./SConscript', variant_dir=build_path, duplicate=0)
 
-if env['examples'] and (env['build_config'] or env['high_priority']):
+if env['examples'] and env['build_config']:
     print("WARNING: Building examples for selected operators not supported. Use examples=0")
     Return()
 
@@ -664,7 +655,7 @@ if env['examples'] and env['exceptions']:
     SConscript('./examples/SConscript', variant_dir='%s/examples' % build_path, duplicate=0)
 
 if env['exceptions']:
-    if env['build_config'] or env['high_priority']:
+    if env['build_config']:
         print("WARNING: Building tests for selected operators not supported")
         Return()
     if env['os'] == 'bare_metal' and env['arch'] == 'armv7a':
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index 7ee144e2cb..f4ad79e32c 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Error.h"
 
+#include <cstdint>
 #include <memory>
 
 namespace arm_compute
@@ -180,7 +181,12 @@ class CPUInfo final
      *
      * @return Vector length if sme2 is enabled, otherwise returns 0.
      */
-    uint64_t get_sme2_vector_length() const;
+    uint64_t get_sme2_vector_length_in_bytes() const;
+    /** Return the vector length in bits for sme2
+     *
+     * @return Vector length in bits if sme2 is enabled, otherwise returns 0.
+     */
+    uint64_t get_sme2_vector_length_in_bits() const;
 
 private:
     struct Impl;
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index aecba3712e..e8cc98f9e4 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -63,6 +63,31 @@ struct UniformQuantizationInfo
     int32_t offset;
 };
 
+/** Requantization info (scale and floating-point offset) when assuming per layer quantization */
+struct UniformRequantizationInfo
+{
+    /** Default constructor */
+    UniformRequantizationInfo() : scale(0.f), offset(0.f)
+    {
+    }
+    /** Constructor
+     *
+     * @param[in] scale  Quantization scale
+     * @param[in] offset Quantization offset
+     */
+    UniformRequantizationInfo(float scale, float offset) : scale(scale), offset(offset)
+    {
+    }
+    /** Checks if the scale and offset are both zero */
+    bool empty() const
+    {
+        return (scale == 0) && (offset == 0);
+    }
+
+    float scale;
+    float offset;
+};
+
 /** Quantization information */
 class QuantizationInfo
 {
@@ -232,6 +257,13 @@ struct Qasymm8QuantizationHelper
         return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
     }
 
+    static inline QUANTIZED_TYPE quantize(float value, const UniformRequantizationInfo &qinfo)
+    {
+        ARM_COMPUTE_ERROR_ON(qinfo.scale == 0);
+        const int quantized = support::cpp11::lround(value / qinfo.scale + qinfo.offset);
+        return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
+    }
+
     /** Quantize a value given a 8-bit asymmetric quantization scheme using a specific rounding policy
      *
      * @param[in] value           Value to quantize
@@ -253,6 +285,21 @@ struct Qasymm8QuantizationHelper
         return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
     }
 
+    static inline QUANTIZED_TYPE
+    quantize(float value, const UniformRequantizationInfo &qinfo, RoundingPolicy rounding_policy)
+    {
+        if (rounding_policy == RoundingPolicy::TO_NEAREST_UP)
+        {
+            return quantize(value, qinfo);
+        }
+
+        ARM_COMPUTE_ERROR_ON(qinfo.scale == 0);
+
+        // We round after adding the offset, because the offset is also float
+        const int quantized = arm_compute::round(value / qinfo.scale + qinfo.offset, rounding_policy);
+        return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
+    }
+
     /** Quantize a value given a 8-bit asymmetric quantization scheme
      *
      * @param[in] value           Value to quantize
@@ -588,7 +635,11 @@ inline float dequantize_s32(int32_t value, const QuantizationInfo &qinfo)
     return dequantize_s32(value, qinfo.uniform());
 }
 
-/*
+/** Compute the requantization offset and scale
+ *
+ * @deprecated because requantization using integer offsets creates rounding issues.
+ * Please use @ref arm_compute::compute_requantization_scale_float_offset() instead.
+ *
  * In case of requantization of a quantized input tensor to an output tensor with another quantization
  * instead of applying dequantization and then a quantization functions, we just compute new scale and
  * offset.
@@ -628,9 +679,32 @@ inline UniformQuantizationInfo compute_requantization_scale_offset(const Uniform
     // In order to minimize flooring we convert the offset to a float,
     // then compute the new offset in the float domain,
     // finally we convert it back as int32_t
-    offset_to_apply -= static_cast<int32_t>(static_cast<float>(uqinfo_in.offset) * uqinfo_in.scale / uqinfo_out.scale);
+
+#ifdef __aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else  //__aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP;
+#endif //__aarch64__
+
+    offset_to_apply -=
+        arm_compute::round(static_cast<float>(uqinfo_in.offset) * uqinfo_in.scale / uqinfo_out.scale, rounding_policy);
     return UniformQuantizationInfo(scale_to_apply, offset_to_apply);
 }
 
+/** Similar to @ref arm_compute::compute_requantization_scale_offset()
+ *  but returning offset as float instead of integer
+ */
+inline UniformRequantizationInfo compute_requantization_scale_float_offset(const UniformQuantizationInfo &uqinfo_in,
+                                                                           const UniformQuantizationInfo &uqinfo_out)
+{
+    float scale_to_apply  = uqinfo_out.scale;
+    float offset_to_apply = static_cast<float>(uqinfo_out.offset);
+
+    scale_to_apply /= uqinfo_in.scale;
+    offset_to_apply -= static_cast<float>(uqinfo_in.offset) * uqinfo_in.scale / uqinfo_out.scale;
+
+    return UniformRequantizationInfo(scale_to_apply, offset_to_apply);
+}
+
 } // namespace arm_compute
 #endif // ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 6c93ff0c39..e4c9cbe879 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -327,6 +327,9 @@ class TensorInfo final : public ITensorInfo
 
 private:
     /** Calculates strides, offset and total size resulting from the specified padding around the XY plane.
+     *
+     * @note When interpreting the required_strides in the return value, only the values up to the corresponding dimension in the tensor are
+     *       valid. For example, a 1D tensor should only refer to 1D in required_strides, a 2D tensor up to 2D in required_strides, and so on.
      *
      * @param[in] padding Padding around the XY plane in elements.
      */
diff --git a/arm_compute/core/utils/DataTypeUtils.h b/arm_compute/core/utils/DataTypeUtils.h
index b19a3dd1e7..86adb761ac 100644
--- a/arm_compute/core/utils/DataTypeUtils.h
+++ b/arm_compute/core/utils/DataTypeUtils.h
@@ -69,6 +69,35 @@ inline size_t data_size_from_type(DataType data_type)
     }
 }
 
+/** Get the underlying integer data type of a quantized data type; any other data type is returned unchanged
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return the underlying data type
+ */
+inline constexpr DataType get_underlying_data_type(DataType data_type)
+{
+    switch (data_type)
+    {
+        case DataType::U8:
+        case DataType::QASYMM8:
+            return DataType::U8;
+        case DataType::S8:
+        case DataType::QSYMM8:
+        case DataType::QASYMM8_SIGNED:
+        case DataType::QSYMM8_PER_CHANNEL:
+            return DataType::S8;
+        case DataType::U16:
+        case DataType::QASYMM16:
+            return DataType::U16;
+        case DataType::S16:
+        case DataType::QSYMM16:
+            return DataType::S16;
+        default:
+            return data_type;
+    }
+}
+
 /** The size in bytes of the data type
  *
  * @param[in] dt Input data type
diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h
index 83b12d572e..575c1498ac 100644
--- a/arm_compute/function_info/ActivationLayerInfo.h
+++ b/arm_compute/function_info/ActivationLayerInfo.h
@@ -65,6 +65,7 @@ class ActivationLayerInfo
 
     /** Lookup table  */
 #ifdef __aarch64__
+    // TODO (COMPMID-7511): delegate to LUTManager
     using LookupTable256   = std::array<qasymm8_t, 256>;
     using LookupTable65536 = std::array<float16_t, 65536>;
 #endif // __aarch64__
diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h
index 0d8f53fe02..57c6408ef4 100644
--- a/arm_compute/runtime/CL/functions/CLCast.h
+++ b/arm_compute/runtime/CL/functions/CLCast.h
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCAST_H
 #define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCAST_H
 
@@ -35,7 +36,7 @@ class CLCompileContext;
 class ICLTensor;
 class ITensorInfo;
 
-/** Basic function to run opencl::kernels::ClCastKernel */
+/** Basic function to run a type cast operation */
 class CLCast : public IFunction
 {
 public:
@@ -52,28 +53,36 @@ class CLCast : public IFunction
     /** Default move assignment operator */
     CLCast &operator=(CLCast &&);
     /** Initialize the function's source, destination
+     *
+     * @note When casting from/to quantized types the scale and zeroPoint are ignored
      *
      * Valid data layouts:
      * - All
      *
      * Valid data type configurations:
-     * |src            |dst                                    |
-     * |:--------------|:--------------------------------------|
-     * |U8             | S8, U16, S16, U32, S32, F16, F32      |
-     * |S8             | U8, U16, S16, U32, S32, F16, F32      |
-     * |U16            | U8, S8, S16, U32, S32, F16, F32       |
-     * |S16            | U8, S8, U16, U32, S32, F16, F32       |
-     * |U32            | U8, S8, U16, S16, S32, F16, F32       |
-     * |S32            | U8, S8, U16, S16, U32, F16, F32       |
-     * |U64            | U8, S8, U16, S16, U32, S32, F16, F32  |
-     * |S64            | U8, S8, U16, S16, U32, S32, F16, F32  |
-     * |F16            | U8, S8, U16, S16, S32, U32, F32       |
-     * |F32            | U8, S8, U16, S16, S32, U32, F16       |
+     * |src                |dst                                                                                             |
+     * |:------------------|:-----------------------------------------------------------------------------------------------|
+     * |U8                 | S8, U16, S16, U32, S32, F16, F32, QASYMM8_SIGNED, QSYMM8, QSYMM8_PER_CHANNEL, 16-bit Quantized |
+     * |S8                 | U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized                                    |
+     * |U16                | U8, S8, S16, U32, S32, F16, F32, 8-bit Quantized, QSYMM16                                      |
+     * |S16                | U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QASYMM16                                     |
+     * |U32                | U8, S8, U16, S16, S32, F16, F32, All Quantized                                                 |
+     * |S32                | U8, S8, U16, S16, U32, F16, F32, All Quantized                                                 |
+     * |U64                | U8, S8, U16, S16, U32, S32, F16, F32, All Quantized                                            |
+     * |S64                | U8, S8, U16, S16, U32, S32, F16, F32, All Quantized                                            |
+     * |F16                | U8, S8, U16, S16, S32, U32, F32, All Quantized                                                 |
+     * |F32                | U8, S8, U16, S16, S32, U32, F16, All Quantized                                                 |
+     * |QASYMM8            | S8, U16, S16, U32, S32, F16, F32, QASYMM8_SIGNED, QSYMM8, QSYMM8_PER_CHANNEL, 16-bit Quantized |
+     * |QASYMM8_SIGNED     | U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized                                    |
+     * |QSYMM8             | U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized                                    |
+     * |QSYMM8_PER_CHANNEL | U8, U16, S16, U32, S32, F16, F32, 16-bit Quantized                                             |
+     * |QASYMM16           | U8, S8, S16, U32, S32, F16, F32, 8-bit Quantized, QSYMM16                                      |
+     * |QSYMM16            | U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QASYMM16                                     |
      *
      * Input data type must be different than output data type.
      *
-     * @param[in]  input  The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32.
-     * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  input  The input tensor to convert.
+     * @param[out] output The output tensor.
      * @param[in]  policy Conversion policy.
      */
     void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
@@ -82,14 +91,11 @@ class CLCast : public IFunction
     configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref CLCast
      *
-     * @param[in] input  Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32.
-     * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[in] policy Conversion policy.
+     * Similar to @ref CLCast::configure()
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy);
-
     // Inherited methods overridden:
     void run() override;
 
diff --git a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
index c3ef2932f1..5025b5eaf4 100644
--- a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
+++ b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
@@ -29,6 +29,7 @@
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
@@ -40,7 +41,10 @@ class CPPBoxWithNonMaximaSuppressionLimit : public IFunction
 {
 public:
     /** Constructor */
-    CPPBoxWithNonMaximaSuppressionLimit(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    CPPBoxWithNonMaximaSuppressionLimit(std::shared_ptr<IMemoryManager> memory_manager);
+    CPPBoxWithNonMaximaSuppressionLimit() : CPPBoxWithNonMaximaSuppressionLimit(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CPPBoxWithNonMaximaSuppressionLimit(const CPPBoxWithNonMaximaSuppressionLimit &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/MemoryGroup.h b/arm_compute/runtime/MemoryGroup.h
index 93ea3d2c72..35da650857 100644
--- a/arm_compute/runtime/MemoryGroup.h
+++ b/arm_compute/runtime/MemoryGroup.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,14 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_MEMORYGROUP_H
-#define ARM_COMPUTE_MEMORYGROUP_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_MEMORYGROUP_H
+#define ACL_ARM_COMPUTE_RUNTIME_MEMORYGROUP_H
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/misc/Macros.h"
+#include "arm_compute/runtime/Allocator.h"
 #include "arm_compute/runtime/IMemoryGroup.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include <cstddef>
 #include <memory>
@@ -66,10 +68,11 @@ class MemoryGroup final : public IMemoryGroup
     std::shared_ptr<IMemoryManager> _memory_manager; /**< Memory manager to be used by the group */
     IMemoryPool                    *_pool;           /**< Memory pool that the group is scheduled with */
     MemoryMappings                  _mappings;       /**< Memory mappings of the group */
+    bool                            _auto_clear;     /**< Whether the memory manager will be auto-cleared on release */
 };
 
 inline MemoryGroup::MemoryGroup(std::shared_ptr<IMemoryManager> memory_manager) noexcept
-    : _memory_manager(memory_manager), _pool(nullptr), _mappings()
+    : _memory_manager(memory_manager), _pool(nullptr), _mappings(), _auto_clear(false)
 {
 }
 
@@ -104,6 +107,17 @@ inline void MemoryGroup::acquire()
     if (!_mappings.empty())
     {
         ARM_COMPUTE_ERROR_ON(!_memory_manager->pool_manager());
+        // If the caller has not populated the underlying memory manager (it has no
+        // pools yet), populate it here with a single pool from a default-constructed
+        // Allocator, and set the flag so that release() clears the memory manager again.
+        // This is needed when using default memory managers that were not set up by the user.
+        if (_memory_manager->pool_manager()->num_pools() == 0)
+        {
+            Allocator alloc{};
+            _memory_manager->populate(alloc, 1);
+            _auto_clear = true;
+        }
+
         _pool = _memory_manager->pool_manager()->lock_pool();
         _pool->acquire(_mappings);
     }
@@ -118,6 +132,12 @@ inline void MemoryGroup::release()
         _pool->release(_mappings);
         _memory_manager->pool_manager()->unlock_pool(_pool);
         _pool = nullptr;
+
+        if (_auto_clear)
+        {
+            _memory_manager->clear();
+            _auto_clear = false;
+        }
     }
 }
 
@@ -126,4 +146,4 @@ inline MemoryMappings &MemoryGroup::mappings()
     return _mappings;
 }
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_MEMORYGROUP_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_MEMORYGROUP_H
diff --git a/arm_compute/runtime/MemoryManagerOnDemand.h b/arm_compute/runtime/MemoryManagerOnDemand.h
index 7c31fe7f5a..0192f0b641 100644
--- a/arm_compute/runtime/MemoryManagerOnDemand.h
+++ b/arm_compute/runtime/MemoryManagerOnDemand.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2019, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H
-#define ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_MEMORYMANAGERONDEMAND_H
+#define ACL_ARM_COMPUTE_RUNTIME_MEMORYMANAGERONDEMAND_H
 
 #include "arm_compute/runtime/ILifetimeManager.h"
 #include "arm_compute/runtime/IMemoryGroup.h"
@@ -49,6 +49,8 @@ class MemoryManagerOnDemand : public IMemoryManager
     /** Allow instances of this class to be moved */
     MemoryManagerOnDemand &operator=(MemoryManagerOnDemand &&) = default;
 
+    static std::shared_ptr<MemoryManagerOnDemand> make_default();
+
     // Inherited methods overridden:
     ILifetimeManager *lifetime_manager() override;
     IPoolManager     *pool_manager() override;
@@ -60,4 +62,4 @@ class MemoryManagerOnDemand : public IMemoryManager
     std::shared_ptr<IPoolManager>     _pool_mgr;     /**< Memory pool manager */
 };
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_MEMORYMANAGERONDEMAND_H
diff --git a/arm_compute/runtime/MemoryRegion.h b/arm_compute/runtime/MemoryRegion.h
index f8a4898281..4922edc2e1 100644
--- a/arm_compute/runtime/MemoryRegion.h
+++ b/arm_compute/runtime/MemoryRegion.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,13 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_RUNTIME_MEMORY_REGION_H
-#define ARM_COMPUTE_RUNTIME_MEMORY_REGION_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_MEMORYREGION_H
+#define ACL_ARM_COMPUTE_RUNTIME_MEMORYREGION_H
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/IMemoryRegion.h"
 
 #include <cstddef>
+#include <cstdint>
 
 namespace arm_compute
 {
@@ -100,4 +101,4 @@ class MemoryRegion final : public IMemoryRegion
     void                    *_ptr;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_RUNTIME_MEMORY_REGION_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_MEMORYREGION_H
diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
index d58f7dda3e..88cedc6724 100644
--- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
 
 namespace arm_compute
@@ -48,7 +49,10 @@ class NEArgMinMaxLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEArgMinMaxLayer() : NEArgMinMaxLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEArgMinMaxLayer(const NEArgMinMaxLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index a6c0cfa7fa..bd049df4af 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -29,6 +29,7 @@
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include <memory>
 
@@ -73,7 +74,10 @@ class NEConvolutionLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEConvolutionLayer() : NEConvolutionLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEConvolutionLayer(const NEConvolutionLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index aabe42f928..0c6bb1d2bf 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, 2023 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,14 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NEDECONVOLUTIONLAYER_H
-#define ARM_COMPUTE_NEDECONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDECONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDECONVOLUTIONLAYER_H
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEReverse.h"
@@ -74,7 +75,10 @@ class NEDeconvolutionLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEDeconvolutionLayer() : NEDeconvolutionLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEDeconvolutionLayer(const NEDeconvolutionLayer &) = delete;
     /** Default move constructor */
@@ -166,4 +170,4 @@ class NEDeconvolutionLayer : public IFunction
     bool               _do_upsampling;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDECONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDECONVOLUTIONLAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 1b0eb8fc3e..76250a6aee 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
 
@@ -43,7 +44,10 @@ class NEDepthwiseConvolutionLayer : public IFunction
 {
 public:
     /** Default constructor */
-    NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEDepthwiseConvolutionLayer() : NEDepthwiseConvolutionLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEDepthwiseConvolutionLayer(const NEDepthwiseConvolutionLayer &) = delete;
     /** Default move constructor */
diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
index 7a94833d10..b789cc0579 100644
--- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,13 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H
-#define ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDETECTIONPOSTPROCESSLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDETECTIONPOSTPROCESSLAYER_H
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
 #include "arm_compute/runtime/Tensor.h"
@@ -47,7 +48,10 @@ class NEDetectionPostProcessLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEDetectionPostProcessLayer() : NEDetectionPostProcessLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEDetectionPostProcessLayer(const NEDetectionPostProcessLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -119,4 +123,4 @@ class NEDetectionPostProcessLayer : public IFunction
     bool   _run_dequantize;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDETECTIONPOSTPROCESSLAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
index a8d930d9ba..ce554d494e 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT1D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/FunctionDescriptors.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
@@ -49,7 +50,10 @@ class NEFFT1D : public IFunction
 {
 public:
     /** Default Constructor */
-    NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager);
+    NEFFT1D() : NEFFT1D(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEFFT1D(const NEFFT1D &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
index 84bfe6b02f..2b98562fe8 100644
--- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEFFTCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEFFTCONVOLUTIONLAYER_H
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
@@ -59,7 +60,10 @@ class NEFFTConvolutionLayer : public IFunction
 {
 public:
     /** Default constructor */
-    NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEFFTConvolutionLayer() : NEFFTConvolutionLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEFFTConvolutionLayer(const NEFFTConvolutionLayer &) = delete;
     /** Prevent instances of this class from being moved (As this class contains non movable objects) */
@@ -172,4 +176,4 @@ class NEFFTConvolutionLayer : public IFunction
     bool           _is_prepared;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEFFTCONVOLUTIONLAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index a0c03af351..78d27c7376 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -28,6 +28,7 @@
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -87,8 +88,10 @@ class NEFullyConnectedLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager  = nullptr,
-                          IWeightsManager                *weights_manager = nullptr);
+    NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager = nullptr);
+    NEFullyConnectedLayer() : NEFullyConnectedLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete;
     /** Prevent instances of this class from being moved (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 70493edd69..1f5e51e598 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -28,6 +28,7 @@
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include <memory>
 
@@ -41,7 +42,10 @@ class NEGEMM : public IFunction
 {
 public:
     /** Constructor */
-    NEGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+    NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager = nullptr);
+    NEGEMM() : NEGEMM(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGEMM(const NEGEMM &) = delete;
     /** Default move constructor */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
index a4e2fce7c1..54def64d18 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/FunctionDescriptors.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include <memory>
 
@@ -50,7 +51,10 @@ class NEGEMMConv2d : public IFunction
 {
 public:
     /** Constructor */
-    NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
+    NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager);
+    NEGEMMConv2d() : NEGEMMConv2d(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGEMMConv2d(const NEGEMMConv2d &) = delete;
     /** Default move constructor */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index 7baa940f82..d37d49b48c 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -30,6 +30,7 @@
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include <memory>
 
@@ -47,8 +48,11 @@ class NEGEMMConvolutionLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager  = nullptr,
+    NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager,
                            IWeightsManager                       *weights_manager = nullptr);
+    NEGEMMConvolutionLayer() : NEGEMMConvolutionLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete;
     /** Prevent instances of this class from being moved (As this class contains non movable objects) */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index c2d3089027..81c6115791 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -29,6 +29,7 @@
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include <memory>
 
@@ -47,8 +48,11 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction
 {
 public:
     /** Constructor */
-    NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager  = nullptr,
+    NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager,
                                  IWeightsManager                *weights_manager = nullptr);
+    NEGEMMLowpMatrixMultiplyCore() : NEGEMMLowpMatrixMultiplyCore(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete;
     /** Default move constructor */
diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
index 0032d0c26d..4ee3cf9f3a 100644
--- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
@@ -29,6 +29,7 @@
 #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
 #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
@@ -62,7 +63,10 @@ class NEGenerateProposalsLayer : public IFunction
      *
      * @param[in] memory_manager (Optional) Memory manager.
      */
-    NEGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEGenerateProposalsLayer() : NEGenerateProposalsLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEGenerateProposalsLayer(const NEGenerateProposalsLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
index b7ff9965db..a719f375fb 100644
--- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
 #include "arm_compute/runtime/Tensor.h"
@@ -47,7 +48,10 @@ class NEInstanceNormalizationLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEInstanceNormalizationLayer() : NEInstanceNormalizationLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEInstanceNormalizationLayer(const NEInstanceNormalizationLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
index c29738d8e7..0e164ee43f 100644
--- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -47,7 +48,10 @@ class NEL2NormalizeLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEL2NormalizeLayer() : NEL2NormalizeLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEL2NormalizeLayer(const NEL2NormalizeLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
index 8416111881..2c723840af 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/common/LSTMParams.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
@@ -47,7 +48,10 @@ class NELSTMLayer : public IFunction
 {
 public:
     /** Default constructor */
-    NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NELSTMLayer() : NELSTMLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NELSTMLayer(const NELSTMLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
index ae951669b3..72efde36ee 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,11 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NELSTMLAYERQUANTIZED_H
-#define ARM_COMPUTE_NELSTMLAYERQUANTIZED_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NELSTMLAYERQUANTIZED_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NELSTMLAYERQUANTIZED_H
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/common/LSTMParams.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
@@ -63,7 +64,10 @@ class NELSTMLayerQuantized : public IFunction
 {
 public:
     /** Default constructor */
-    NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager);
+    NELSTMLayerQuantized() : NELSTMLayerQuantized(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NELSTMLayerQuantized(const NELSTMLayerQuantized &) = delete;
     /** Prevent instances of this class from being moved (As this class contains pointers) */
@@ -233,4 +237,4 @@ class NELSTMLayerQuantized : public IFunction
     bool _is_prepared;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NELSTMLAYERQUANTIZED_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NELSTMLAYERQUANTIZED_H
diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h
index ae0e317d2e..6b80917716 100644
--- a/arm_compute/runtime/NEON/functions/NEMatMul.h
+++ b/arm_compute/runtime/NEON/functions/NEMatMul.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include <memory>
 
@@ -78,7 +79,10 @@ class NEMatMul : public IFunction
 {
 public:
     /** Constructor */
-    NEMatMul();
+    NEMatMul(std::shared_ptr<IMemoryManager> memory_manager);
+    NEMatMul() : NEMatMul(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Destructor */
     ~NEMatMul();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index ce615c62e0..0f09f80c62 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -28,6 +28,7 @@
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -49,7 +50,10 @@ class NENormalizationLayer : public IFunction
 {
 public:
     /** Default constructor */
-    NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NENormalizationLayer() : NENormalizationLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NENormalizationLayer(const NENormalizationLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
index ce8aeca790..eabcd32536 100644
--- a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include <memory>
 
@@ -43,7 +44,10 @@ class NEPooling3dLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEPooling3dLayer() : NEPooling3dLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEPooling3dLayer(const NEPooling3dLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index 51f7d982f0..4aa1b205fc 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -28,6 +28,7 @@
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include <memory>
 
@@ -45,7 +46,10 @@ class NEPoolingLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NEPoolingLayer() : NEPoolingLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEPoolingLayer(const NEPoolingLayer &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h
index af7f464ac9..8ba2a4911d 100644
--- a/arm_compute/runtime/NEON/functions/NERNNLayer.h
+++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,10 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NERNNLAYER_H
-#define ARM_COMPUTE_NERNNLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NERNNLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NERNNLAYER_H
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NECopy.h"
@@ -41,7 +42,10 @@ class NERNNLayer : public IFunction
 {
 public:
     /** Default constructor */
-    NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager);
+    NERNNLayer() : NERNNLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NERNNLayer(const NERNNLayer &) = delete;
     /** Prevent instances of this class from being moved (As this class contains pointers) */
@@ -116,4 +120,4 @@ class NERNNLayer : public IFunction
     bool                  _is_prepared;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NERNNLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NERNNLAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index 5b8d8cdf2b..eb02099b86 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,12 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NEON_REDUCE_MEAN_H
-#define ARM_COMPUTE_NEON_REDUCE_MEAN_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREDUCEMEAN_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREDUCEMEAN_H
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
@@ -38,7 +39,10 @@ class NEReduceMean : public IFunction
 {
 public:
     /** Constructor */
-    NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager);
+    NEReduceMean() : NEReduceMean(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEReduceMean(const NEReduceMean &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -95,4 +99,4 @@ class NEReduceMean : public IFunction
     bool                              _keep_dims;
 };
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NEON_REDUCE_MEAN_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREDUCEMEAN_H
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index 8b56e17f65..c46d0e84e0 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -25,6 +25,7 @@
 #define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREDUCTIONOPERATION_H
 
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -45,7 +46,10 @@ class NEReductionOperation : public IFunction
 {
 public:
     /** Default constructor */
-    NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager);
+    NEReductionOperation() : NEReductionOperation(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEReductionOperation(const NEReductionOperation &) = delete;
     /** Default move constructor */
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index 1787de6237..276f5dc287 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,12 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NESOFTMAXLAYER_H
-#define ARM_COMPUTE_NESOFTMAXLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESOFTMAXLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESOFTMAXLAYER_H
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include <memory>
 
@@ -41,7 +42,10 @@ class NESoftmaxLayerGeneric : public IFunction
 {
 public:
     /** Constructor */
-    NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager);
+    NESoftmaxLayerGeneric() : NESoftmaxLayerGeneric(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NESoftmaxLayerGeneric(const NESoftmaxLayerGeneric &) = delete;
     /** Default move constructor */
@@ -98,4 +102,4 @@ using NESoftmaxLayer    = NESoftmaxLayerGeneric<false>;
 using NELogSoftmaxLayer = NESoftmaxLayerGeneric<true>;
 
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NESOFTMAXLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESOFTMAXLAYER_H
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index 7b00fd3b9d..dce1e3e764 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
@@ -49,7 +50,10 @@ class NEWinogradConvolutionLayer : public IFunction
 {
 public:
     /** Constructor */
-    NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
+    NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager);
+    NEWinogradConvolutionLayer() : NEWinogradConvolutionLayer(MemoryManagerOnDemand::make_default())
+    {
+    }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete;
     /** Default move constructor */
diff --git a/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h b/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h
index bc0e36cb0a..5958382f6c 100644
--- a/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h
+++ b/arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h
@@ -111,6 +111,13 @@ class CpuGemmAssemblyDispatch : arm_compute::experimental::IOperator
                    const GEMMInfo    &gemm_info = GEMMInfo());
 
     /** Indicates whether or not this function can be used to process the given parameters.
+     * Valid data type configurations:
+     * |src0         |src1        |src2      |dst            |
+     * |:------------|:-----------|:---------|:--------------|
+     * |F32          |F32         |nullptr   |F32            |
+     * |F16          |F16         |nullptr   |F16            |
+     * |BFLOAT16     |BFLOAT16    |nullptr   |BFLOAT16       |
+     * |BFLOAT16     |BFLOAT16    |nullptr   |F32            |
      *
      * @param[in] a         Input tensor info (Matrix A)
      * @param[in] b         Input tensor info (Matrix B)
diff --git a/arm_compute/runtime/experimental/operators/CpuDequantize.h b/arm_compute/runtime/experimental/operators/CpuDequantize.h
new file mode 100644
index 0000000000..90b3ebd107
--- /dev/null
+++ b/arm_compute/runtime/experimental/operators/CpuDequantize.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUDEQUANTIZE_H
+#define ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUDEQUANTIZE_H
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
+
+#include <memory>
+
+/*
+ * A shallow wrapper for arm_compute::cpu::CpuDequantize.
+ * Any new features should be added to arm_compute::cpu::CpuDequantize and
+ * arm_compute::experimental::op::CpuDequantize should remain a shallow wrapper.
+*/
+namespace arm_compute
+{
+namespace experimental
+{
+namespace op
+{
+/** Thin wrapper class which runs cpu::CpuDequantize to dequantize an input tensor; all state lives behind the private Impl pointer */
+class CpuDequantize : public INEOperator
+{
+public:
+    /** Default Constructor */
+    CpuDequantize();
+    /** Destructor (only declared here; Impl is an incomplete type in this header) */
+    ~CpuDequantize();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CpuDequantize(const CpuDequantize &) = delete;
+    /** Default move constructor */
+    CpuDequantize(CpuDequantize &&) = default;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    CpuDequantize &operator=(const CpuDequantize &) = delete;
+    /** Default move assignment operator */
+    CpuDequantize &operator=(CpuDequantize &&) = default;
+    /** Configure the operator from the input and output tensor infos.
+     *
+     * Valid configurations and data layouts can be referenced in @ref arm_compute::NEDequantizationLayer.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuDequantize
+     *
+     * @param[in] input  Source tensor info, as for @ref CpuDequantize::configure
+     * @param[in] output Destination tensor info, as for @ref CpuDequantize::configure
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> impl_;
+};
+} // namespace op
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUDEQUANTIZE_H
diff --git a/arm_compute/runtime/experimental/operators/CpuGEMMLowp.h b/arm_compute/runtime/experimental/operators/CpuGEMMLowp.h
new file mode 100644
index 0000000000..0ca7113d8f
--- /dev/null
+++ b/arm_compute/runtime/experimental/operators/CpuGEMMLowp.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUGEMMLOWP_H
+#define ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUGEMMLOWP_H
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/function_info/GEMMInfo.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+class ITensorInfo;
+
+namespace experimental
+{
+namespace op
+{
+/**
+ * A shallow wrapper for arm_compute::cpu::CpuGemmLowpMatrixMultiplyCore.
+ * Any new features should be added to arm_compute::cpu::CpuGemmLowpMatrixMultiplyCore and
+ * arm_compute::experimental::op::CpuGEMMLowp should remain a shallow wrapper.
+ */
+class CpuGEMMLowp : public INEOperator
+{
+public:
+    /** Constructor */
+    CpuGEMMLowp();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CpuGEMMLowp(const CpuGEMMLowp &) = delete;
+    /** Default move constructor */
+    CpuGEMMLowp(CpuGEMMLowp &&) = default;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    CpuGEMMLowp &operator=(const CpuGEMMLowp &) = delete;
+    /** Default move assignment operator */
+    CpuGEMMLowp &operator=(CpuGEMMLowp &&) = default;
+    /** Destructor (only declared here; Impl is an incomplete type in this header) */
+    ~CpuGEMMLowp();
+    /** Initialise the operator from the tensor infos of its inputs and output.
+     *
+     * Valid configurations can be referenced in @ref arm_compute::NEGEMMLowpMatrixMultiplyCore.
+     */
+    void configure(const ITensorInfo *a,
+                   const ITensorInfo *b,
+                   const ITensorInfo *c,
+                   ITensorInfo       *output,
+                   const GEMMInfo    &gemm_info = GEMMInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuGEMMLowp
+     *
+     * Takes the same arguments as @ref CpuGEMMLowp::configure(), except that output is const here.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *output,
+                           const GEMMInfo    &gemm_info = GEMMInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &tensors) override;
+
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+} // namespace op
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUGEMMLOWP_H
diff --git a/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h b/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h
index 2bbc7148d5..6cb539a6dc 100644
--- a/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h
+++ b/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h
@@ -137,6 +137,13 @@ class CpuGemmConv2d : public IOperator
                                const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
                                bool                       enable_fast_math = false);
 
+    /** Update of quantization information at the run stage for convolution so that the quantization multipliers can be properly calculated.
+     * Please refer to @ref NEGEMMConvolutionLayer for a more in-depth explanation and example.
+     *
+     * @param[in] tensors Vector that contains the tensors to operate on.
+     */
+    void update_quantization_parameters(ITensorPack &tensors);
+
     void                             run(ITensorPack &tensors) override;
     void                             prepare(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
diff --git a/arm_compute/runtime/experimental/operators/CpuQuantize.h b/arm_compute/runtime/experimental/operators/CpuQuantize.h
new file mode 100644
index 0000000000..962204ca36
--- /dev/null
+++ b/arm_compute/runtime/experimental/operators/CpuQuantize.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUQUANTIZE_H
+#define ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUQUANTIZE_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+/*
+ * A shallow wrapper for arm_compute::cpu::CpuQuantize.
+ * Any new features should be added to arm_compute::cpu::CpuQuantize and
+ * arm_compute::experimental::op::CpuQuantize should remain a shallow wrapper.
+*/
+namespace arm_compute
+{
+namespace experimental
+{
+namespace op
+{
+
+/** Thin wrapper class which runs cpu::CpuQuantize; all state lives behind the private Impl pointer */
+class CpuQuantize : public arm_compute::experimental::INEOperator
+{
+public:
+    CpuQuantize();
+    /** Destructor (only declared here; Impl is an incomplete type in this header) */
+    ~CpuQuantize();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CpuQuantize(const CpuQuantize &) = delete;
+    /** Default move constructor */
+    CpuQuantize(CpuQuantize &&) = default;
+    /** Prevent instances of this class from being copy-assigned (As this class contains pointers) */
+    CpuQuantize &operator=(const CpuQuantize &) = delete;
+    /** Default move assignment operator */
+    CpuQuantize &operator=(CpuQuantize &&) = default;
+    /** Configure the operator from the input and output tensor infos.
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src                |dst                                    |
+     * |:------------------|:--------------------------------------|
+     * |QASYMM8            |QASYMM8, QASYMM8_SIGNED, QASYMM16      |
+     * |QASYMM8_SIGNED     |QASYMM8, QASYMM8_SIGNED, QASYMM16      |
+     * |F16                |QASYMM8, QASYMM8_SIGNED, QASYMM16      |
+     * |F32                |QASYMM8, QASYMM8_SIGNED, QASYMM16      |
+     *
+     * @param[in]  input  Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] output Destination tensor info with the same dimensions as input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuQuantize
+     *
+     * Takes the same arguments as @ref CpuQuantize::configure(), except that output is const here.
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> impl_;
+};
+} // namespace op
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUQUANTIZE_H
diff --git a/docs/Doxyfile b/docs/Doxyfile
index d92a65f340..0621168e94 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -60,7 +60,7 @@ PROJECT_NAME           = "Compute Library"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 24.09
+PROJECT_NUMBER         = 24.11
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/docs/user_guide/library.dox b/docs/user_guide/library.dox
index 1f97ccc458..371c2e1133 100644
--- a/docs/user_guide/library.dox
+++ b/docs/user_guide/library.dox
@@ -606,10 +606,5 @@ Supported data-types options are:
 
 The list of supported operators can be found in filelist.json in the root of Compute Library repo.
 
-@subsection architecture_experimental_build_high_priority_operators Build high priority operators
-
-Selecting high_priority when building Compute Library, one new library will be created: libarm_compute_hp and
-will contain a selected subset of the libary operators. Currently the operators are staticly set.
-
 */
 } // namespace arm_compute
diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox
index f423260fb5..47d872e0f1 100644
--- a/docs/user_guide/operator_list.dox
+++ b/docs/user_guide/operator_list.dox
@@ -455,16 +455,22 @@ where N = batches, C = channels, H = height, W = width, D = depth
   <td>
     <table>
     <tr><th>src<th>dst
-    <tr><td>U8<td>S8, U16, S16, U32, S32, F16, F32
-    <tr><td>S8<td>U8, U16, S16, U32, S32, F16, F32
-    <tr><td>U16<td>U8, S8, S16, U32, S32, F16, F32
-    <tr><td>S16<td>U8, S8, U16, U32, S32, F16, F32
-    <tr><td>U32<td>U8, S8, U16, S16, S32, F16, F32
-    <tr><td>S32<td>U8, S8, U16, S16, U32, F16, F32
-    <tr><td>U64<td>U8, S8, U16, S16, U32, S32, F16, F32
-    <tr><td>S64<td>U8, S8, U16, S16, U32, S32, F16, F32
-    <tr><td>F16<td>U8, S8, U16, S16, S32, U32, F32
-    <tr><td>F32<td>U8, S8, U16, S16, S32, U32, F16
+    <tr><td>U8<td>S8, U16, S16, U32, S32, F16, F32, QASYMM8_SIGNED, QSYMM8, QSYMM8_PER_CHANNEL, 16-bit Quantized
+    <tr><td>S8<td>U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized
+    <tr><td>U16<td>U8, S8, S16, U32, S32, F16, F32, 8-bit Quantized, QSYMM16
+    <tr><td>S16<td>U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QASYMM16
+    <tr><td>U32<td>U8, S8, U16, S16, S32, F16, F32, All Quantized
+    <tr><td>S32<td>U8, S8, U16, S16, U32, F16, F32, All Quantized
+    <tr><td>U64<td>U8, S8, U16, S16, U32, S32, F16, F32, All Quantized
+    <tr><td>S64<td>U8, S8, U16, S16, U32, S32, F16, F32, All Quantized
+    <tr><td>F16<td>U8, S8, U16, S16, S32, U32, F32, All Quantized
+    <tr><td>F32<td>U8, S8, U16, S16, S32, U32, F16, All Quantized
+    <tr><td>QASYMM8<td>S8, U16, S16, U32, S32, F16, F32, QASYMM8_SIGNED, QSYMM8, QSYMM8_PER_CHANNEL, 16-bit Quantized
+    <tr><td>QASYMM8_SIGNED<td>U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized
+    <tr><td>QSYMM8<td>U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized
+    <tr><td>QSYMM8_PER_CHANNEL<td>U8, U16, S16, U32, S32, F16, F32, 16-bit Quantized
+    <tr><td>QASYMM16<td>U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QSYMM16
+    <tr><td>QSYMM16<td>U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QASYMM16
     </table>
 <tr>
   <td rowspan="2">ChannelShuffleLayer
diff --git a/filelist.json b/filelist.json
index 5b49a68692..38cdff601d 100644
--- a/filelist.json
+++ b/filelist.json
@@ -109,15 +109,6 @@
       "src/c/operators/AclActivation.cpp"
     ]
   },
-  "high_priority": [
-    "Activation",
-    "DepthwiseConv2d",
-    "Conv2d",
-    "Permute",
-    "Pool2d",
-    "Reshape",
-    "MatMul"
-  ],
   "gpu": {
     "common": [
       "src/core/CL/CLCompileContext.cpp",
@@ -1612,11 +1603,14 @@
             "src/runtime/experimental/operators/CpuActivation.cpp",
             "src/runtime/experimental/operators/CpuAdd.cpp",
             "src/runtime/experimental/operators/CpuDepthwiseConv2d.cpp",
+            "src/runtime/experimental/operators/CpuDequantize.cpp",
             "src/runtime/experimental/operators/CpuElementwise.cpp",
+            "src/runtime/experimental/operators/CpuGEMMLowp.cpp",
             "src/runtime/experimental/operators/CpuGemm.cpp",
             "src/runtime/experimental/operators/CpuGemmConv2d.cpp",
             "src/runtime/experimental/operators/CpuGemmDirectConv2d.cpp",
             "src/runtime/experimental/operators/CpuMul.cpp",
+            "src/runtime/experimental/operators/CpuQuantize.cpp",
             "src/runtime/experimental/operators/CpuSoftmax.cpp",
             "src/runtime/experimental/operators/CpuSub.cpp",
             "src/runtime/experimental/operators/CpuTranspose.cpp",
@@ -2312,7 +2306,7 @@
             "qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"]
           },
           "sve": {
-            "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp" ]
+            "common": [ "src/cpu/kernels/softmax/generic/sve/impl.cpp", "src/cpu/kernels/softmax/generic/sve/impl_bf16.cpp" ]
           },
           "sve2":{
             "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"],
diff --git a/scripts/format_code.py b/scripts/format_code.py
index 8bfb3f5601..6da63fff81 100755
--- a/scripts/format_code.py
+++ b/scripts/format_code.py
@@ -142,20 +142,20 @@ def check_license(filename):
     f.close()
 
     f = open(filename, "w")
-    f.write("".join(content[:2]))
+    f.write("".join(content[:3]))
 
     year = datetime.datetime.now().year
     # This only works until year 9999
-    m = re.match(r"(.*Copyright \(c\) )(.*\d{4})( [Arm|ARM].*)", content[2])
+    m = re.match(r"(.*FileCopyrightText: )(.*\d{4})( [arm|Arm|ARM].*)", content[3])
 
     if not m:
-        f.write("Copyright (c) {} Arm Limited\n".format(year))
+        f.write("# SPDX-FileCopyrightText: {} Arm Limited\n#\n".format(year))
     else:
         updated_year = adjust_copyright_year(m.group(2), year)
-        f.write("Copyright (c) {} Arm Limited\n".format(updated_year))
+        f.write("# SPDX-FileCopyrightText: {} Arm Limited\n".format(updated_year))
 
     # Copy the rest of the file's content:
-    f.write("".join(content[3:]))
+    f.write("".join(content[4:]))
     f.close()
 
 
@@ -276,7 +276,7 @@ def run(self):
 
                 logger.info("Formatting %s" % f)
 
-            check_license("LICENSE")
+            check_license("LICENSES/MIT.txt")
 
         except subprocess.CalledProcessError as e:
             retval = -1
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index 4aa157efd5..ed869de9aa 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -358,7 +358,8 @@ filegroup(
 	"cpu/kernels/scale/sve/integer.cpp",
 	"cpu/kernels/scale/sve/qasymm8.cpp",
 	"cpu/kernels/scale/sve/qasymm8_signed.cpp",
-	"cpu/kernels/softmax/generic/sve/impl.cpp"]  +
+	"cpu/kernels/softmax/generic/sve/impl.cpp",
+	"cpu/kernels/softmax/generic/sve/impl_bf16.cpp"]  +
     glob(["**/*.h",
     "**/*.hpp",
     "**/*.inl"]),
@@ -1042,11 +1043,14 @@ filegroup(
 	"runtime/experimental/operators/CpuActivation.cpp",
 	"runtime/experimental/operators/CpuAdd.cpp",
 	"runtime/experimental/operators/CpuDepthwiseConv2d.cpp",
+	"runtime/experimental/operators/CpuDequantize.cpp",
 	"runtime/experimental/operators/CpuElementwise.cpp",
+	"runtime/experimental/operators/CpuGEMMLowp.cpp",
 	"runtime/experimental/operators/CpuGemm.cpp",
 	"runtime/experimental/operators/CpuGemmConv2d.cpp",
 	"runtime/experimental/operators/CpuGemmDirectConv2d.cpp",
 	"runtime/experimental/operators/CpuMul.cpp",
+	"runtime/experimental/operators/CpuQuantize.cpp",
 	"runtime/experimental/operators/CpuSoftmax.cpp",
 	"runtime/experimental/operators/CpuSub.cpp",
 	"runtime/experimental/operators/CpuTranspose.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 58eca30847..e8ae6705ac 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -328,6 +328,7 @@ target_sources(
 	cpu/kernels/scale/sve/qasymm8.cpp
 	cpu/kernels/scale/sve/qasymm8_signed.cpp
 	cpu/kernels/softmax/generic/sve/impl.cpp
+	cpu/kernels/softmax/generic/sve/impl_bf16.cpp
 )
 
 target_sources(
@@ -1033,11 +1034,14 @@ target_sources(
 	runtime/experimental/operators/CpuActivation.cpp
 	runtime/experimental/operators/CpuAdd.cpp
 	runtime/experimental/operators/CpuDepthwiseConv2d.cpp
+	runtime/experimental/operators/CpuDequantize.cpp
 	runtime/experimental/operators/CpuElementwise.cpp
+	runtime/experimental/operators/CpuGEMMLowp.cpp
 	runtime/experimental/operators/CpuGemm.cpp
 	runtime/experimental/operators/CpuGemmConv2d.cpp
 	runtime/experimental/operators/CpuGemmDirectConv2d.cpp
 	runtime/experimental/operators/CpuMul.cpp
+	runtime/experimental/operators/CpuQuantize.cpp
 	runtime/experimental/operators/CpuSoftmax.cpp
 	runtime/experimental/operators/CpuSub.cpp
 	runtime/experimental/operators/CpuTranspose.cpp
diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp
index 2352e27a17..09e220e75e 100644
--- a/src/common/cpuinfo/CpuInfo.cpp
+++ b/src/common/cpuinfo/CpuInfo.cpp
@@ -417,9 +417,12 @@ CpuInfo CpuInfo::build()
 #elif defined(__aarch64__) && defined(_WIN64)    /* #elif defined(__aarch64__) && defined(__APPLE__) */
     CpuIsaInfo isainfo;
     isainfo.neon = true;
-    if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
+    isainfo.dot  = IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE);
+    if (NTDDI_VERSION >= NTDDI_WIN11_GE)
     {
-        isainfo.dot = true;
+        isainfo.fp16 = IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE);
+        isainfo.sve  = IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE);
+        isainfo.i8mm = IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE);
     }
     SYSTEM_INFO sysinfo;
     GetSystemInfo(&sysinfo);
diff --git a/src/core/CL/cl_kernels/common/cast.cl b/src/core/CL/cl_kernels/common/cast.cl
index 036a683ec7..e2de6dd8eb 100644
--- a/src/core/CL/cl_kernels/common/cast.cl
+++ b/src/core/CL/cl_kernels/common/cast.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,19 +70,14 @@ __kernel void cast_down(
     VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
     in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr);
 
-#if defined(IS_DATA_TYPE_QUANTIZED)
-    in_data ^= (VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE))0x80;
-#endif // defined(IS_DATA_TYPE_QUANTIZED)
+#if defined(QSYMM8_PER_CHANNEL_TO_QASYMM8)
+    // This operation mode is used in Gemmlowp
+    in_data ^= (VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)) 0x80;
+#endif // defined(QSYMM8_PER_CHANNEL_TO_QASYMM8)
 
-#if defined(IS_DATA_TYPE_FLOAT)
     VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
     res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
     STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#else  /* defined(IS_DATA_TYPE_FLOAT) */
-    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-    res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#endif /* defined(IS_DATA_TYPE_FLOAT) */
 }
 
 /** This function performs a up-casting
@@ -122,13 +117,7 @@ __kernel void cast_up(
     VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
     in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr);
 
-#if defined(IS_DATA_TYPE_FLOAT)
-    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-    res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#else  /* defined(IS_DATA_TYPE_FLOAT) */
     VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
     res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
     STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#endif /* defined(IS_DATA_TYPE_FLOAT) */
 }
diff --git a/src/core/CL/cl_kernels/common/quantization_layer.cl b/src/core/CL/cl_kernels/common/quantization_layer.cl
index 69cc288c25..072a9721c4 100644
--- a/src/core/CL/cl_kernels/common/quantization_layer.cl
+++ b/src/core/CL/cl_kernels/common/quantization_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,6 +81,10 @@ __kernel void quantization_layer(
     // Create scale and offset vectors
     const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE;
     const VEC_DATA_TYPE(int, VEC_SIZE) voffset         = OFFSET;
+
+    // Quantize
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    res = CLAMP(CONVERT_RTE_VEC(val_float / vscale, int, VEC_SIZE) + voffset, MIN_QUANT_VAL, MAX_QUANT_VAL);
 #else  // defined(IS_FLOAT)
     // Load data
     VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
@@ -91,18 +95,25 @@ __kernel void quantization_layer(
 
     // Create scale and offset vectors
     const VEC_DATA_TYPE(float, VEC_SIZE) vscale = SCALE;
-    const VEC_DATA_TYPE(int, VEC_SIZE) voffset  = OFFSET;
-#endif // defined(IS_FLOAT)
+    const VEC_DATA_TYPE(float, VEC_SIZE) voffset = OFFSET;
 
     // Quantize
     VEC_DATA_TYPE(int, VEC_SIZE)
-    res = CLAMP(CONVERT_RTE_VEC(val_float / vscale, int, VEC_SIZE) + voffset, MIN_QUANT_VAL, MAX_QUANT_VAL);
+    res = CLAMP(CONVERT_RTE_VEC(val_float / vscale + voffset, int, VEC_SIZE), MIN_QUANT_VAL, MAX_QUANT_VAL);
+#endif // defined(IS_FLOAT)
 
     // Store result
     VSTORE(VEC_SIZE)
     (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
 #else  //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+
+    // Each thread computes a single element
+#if defined(IS_FLOAT)
     *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE_IN *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL);
+#else // !defined(IS_FLOAT)
+    *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE_IN *)input.ptr) / ((float)SCALE) + (float)OFFSET, int), MIN_QUANT_VAL, MAX_QUANT_VAL);
+#endif // defined(IS_FLOAT)
+
 #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
 }
 #endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl
index bfc0995bb8..60258938d1 100644
--- a/src/core/CL/cl_kernels/common/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/common/softmax_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, 2023 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -169,7 +169,7 @@ __kernel void softmax_x(
     // Normalize the data.
 #ifdef IS_QUANTIZED
 # if IS_LOG
-    TMP_DATA_TYPE norm_offset = -log(sum_value) + DST_OFFSET;
+    TMP_DATA_TYPE norm_offset = -log(sum_value) / DST_SCALE + DST_OFFSET;
 #  define NORMALIZE(SIZE, x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, SIZE), rte)
 # else // IS_LOG
     TMP_DATA_TYPE norm_div = sum_value * DST_SCALE;
@@ -333,7 +333,7 @@ __kernel void softmax_non_x(
     // Normalize the data.
 #ifdef IS_QUANTIZED
 # if IS_LOG
-    VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_offset = -log(sum_value) + DST_OFFSET;
+    VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_offset = -log(sum_value) / DST_SCALE + DST_OFFSET;
 #  define NORMALIZE(x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE), rte)
 # else // IS_LOG
     VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_div = sum_value * DST_SCALE;
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index a6d08e5bad..79adb04a09 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -137,7 +137,7 @@ unsigned int CPUInfo::get_L2_cache_size() const
     return _impl->L2_cache_size;
 }
 
-uint64_t CPUInfo::get_sme2_vector_length() const
+uint64_t CPUInfo::get_sme2_vector_length_in_bytes() const
 {
 #ifdef ARM_COMPUTE_ENABLE_SME2
     if (this->has_sme2())
@@ -148,6 +148,12 @@ uint64_t CPUInfo::get_sme2_vector_length() const
     return 0;
 #endif // ARM_COMPUTE_ENABLE_SME2
 }
+
+uint64_t CPUInfo::get_sme2_vector_length_in_bits() const
+{
+    return get_sme2_vector_length_in_bytes() * 8;
+}
+
 unsigned int CPUInfo::get_cpu_num_excluding_little() const
 {
 #if defined(__ANDROID__)
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
index b93e64a0ef..522369309b 100644
--- a/src/core/NEON/NEAsymm.h
+++ b/src/core/NEON/NEAsymm.h
@@ -651,6 +651,26 @@ inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int3
     return rf;
 }
 
+inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, float offset)
+{
+    const float32x4_t voffset   = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf        = {{
+#ifdef __aarch64__
+        vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+        vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+        vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+        vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#else  //__aarch64__
+        vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+        vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+        vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+        vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#endif //__aarch64__
+    }};
+    return rf;
+}
+
 /** Quantize a neon vector holding 16 floating point values.
  *
  * @param[in] qv Input values to be quantized.
@@ -666,6 +686,14 @@ inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationIn
     return vcombine_u8(pa, pb);
 }
 
+inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformRequantizationInfo &qi)
+{
+    auto            rf = vquantize_internal(qv, qi.scale, qi.offset);
+    const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+    const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+    return vcombine_u8(pa, pb);
+}
+
 /** Signed quantize a neon vector holding 16 floating point values.
  *
  * @param[in] qv Input values to be quantized.
@@ -681,6 +709,14 @@ inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantiza
     return vcombine_s8(pa, pb);
 }
 
+inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformRequantizationInfo &qi)
+{
+    auto           rf = vquantize_internal(qv, qi.scale, qi.offset);
+    const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+    const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+    return vcombine_s8(pa, pb);
+}
+
 /** Quantize to QASYMM16 a neon vector holding 16 floating point values.
  *
  * @param[in] qv Input values to be quantized.
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp
index 1e4de4a39e..04aa63019f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16bf16.cpp
@@ -45,6 +45,7 @@ GemmImplementation<bfloat16, bfloat16, bfloat16>::with_estimate(
     [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, bfloat16>::estimate_cycles<bfloat16>(args); },
     [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_a64_ffinterleaved_bf16fp32_mmla_8x12, bfloat16, bfloat16>(args); }
 ),
+#ifdef ARM_COMPUTE_ENABLE_SVE
 GemmImplementation<bfloat16, bfloat16, bfloat16>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
     "sve_ffinterleaved_bf16fp32_mmla_8x3VL",
@@ -53,6 +54,7 @@ GemmImplementation<bfloat16, bfloat16, bfloat16>::with_estimate(
     [](const GemmArgs &args) { return GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, bfloat16, bfloat16>::estimate_cycles<bfloat16>(args); },
     [](const GemmArgs &args) { return new GemmInterleavedFixedFormat<cls_sve_ffinterleaved_bf16fp32_mmla_8x3VL, bfloat16, bfloat16>(args); }
 ),
+#endif // ARM_COMPUTE_ENABLE_SVE
 #endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
 #endif // ARM_COMPUTE_ENABLE_BF16
 #endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index d0a8635604..39fd653a6a 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -172,10 +172,12 @@ namespace utils {
 // get_vector_length(): Returns SVE vector length for type "T".
 //
 // It is required that this can be compiled by a compiler in non-SVE mode, but it must be prevented from running (at
-// runtime) if SVE is not enabled.  Typically this is used by switchyard/driver code which is built in normal mode
+// runtime) if SVE is not enabled. Typically this is used by switchyard/driver code which is built in normal mode
 // which then calls SVE kernels (compiled accordingly) iff SVE is detected at runtime.
 template <typename T>
 inline unsigned long get_vector_length() {
+// x0 register is not available in 32-bit builds
+#if defined(__aarch64__)
     uint64_t vl;
 
     __asm __volatile (
@@ -185,10 +187,13 @@ inline unsigned long get_vector_length() {
         :
         : "x0"
     );
-
     return vl / sizeof(T);
+#else // !defined(__aarch64__)
+    return 16 / sizeof(T);
+#endif // defined(__aarch64__)
 }
 
+#ifdef __aarch64__
 namespace sme {
 
 template <typename T>
@@ -207,6 +212,7 @@ inline uint64_t get_vector_length() {
 }
 
 } // namespace sme
+#endif // __aarch64__
 
 // get_vector_length(VLType): Returns vector length for type "T".
 //
diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h
index 381de2284a..4b9e110b87 100644
--- a/src/core/NEON/wrapper/intrinsics/cvt.h
+++ b/src/core/NEON/wrapper/intrinsics/cvt.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2022-2023 Arm Limited.
+ * Copyright (c) 2020, 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_WRAPPER_CVT_H
-#define ARM_COMPUTE_WRAPPER_CVT_H
+#ifndef ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_CVT_H
+#define ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_CVT_H
 
 #include <arm_neon.h>
 
@@ -82,6 +82,18 @@ inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type
 {
     return vcvtaq_s32_f32(a);
 }
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type vcvtn(const float32x4_t &a)
+{
+    return vcvtnq_u32_f32(a);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type vcvtn(const float32x4_t &a)
+{
+    return vcvtnq_s32_f32(a);
+}
 #endif //__aarch64__
 
 #if defined(ARM_COMPUTE_ENABLE_BF16)
@@ -104,4 +116,4 @@ inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr)
 
 } // namespace wrapper
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CVT_H */
+#endif // ACL_SRC_CORE_NEON_WRAPPER_INTRINSICS_CVT_H
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index cd849c3666..5278a11db4 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -207,8 +207,14 @@
 
 #if defined(ARM_COMPUTE_ENABLE_BF16)
 #define REGISTER_BF16_NEON(func_name) &(func_name)
-#else /* !(defined(ARM_COMPUTE_ENABLE_BF16))*/
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#define REGISTER_BF16_SVE(func_name) &(func_name)
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+#else  /* !(defined(ARM_COMPUTE_ENABLE_BF16))*/
 #define REGISTER_BF16_NEON(func_name) nullptr
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#define REGISTER_BF16_SVE(func_name) nullptr
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
 #endif /* defined(ARM_COMPUTE_ENABLE_BF16)*/
 
 #endif // ACL_SRC_CORE_COMMON_REGISTRARS_H
diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp
index 62ad2bab6d..5c9547af53 100644
--- a/src/core/helpers/LUTManager.cpp
+++ b/src/core/helpers/LUTManager.cpp
@@ -59,16 +59,35 @@ inline float16_t activation(float16_t x, const LUTInfo &info)
     return out;
 }
 
+inline float exponential(float fp, const LUTInfo &info)
+{
+    return std::exp(fp * info.beta);
+}
+
 // Read bf16 value as u16, convert to fp32.
 // Calculate exp in fp32, return as bf16
-inline uint16_t exponential(uint16_t x, const LUTInfo &info)
+inline uint16_t exponential_bf16(uint16_t x, const LUTInfo &info)
 {
     float fp = bf16_to_float(x);
-    fp       = std::exp(fp * info.beta * -1);
+    fp       = exponential(fp, info);
     return float_to_bf16(fp);
 }
 
-void init_lut_16bit(LookupTable65536 *lut, const LUTInfo &info)
+void init_lut(LookupTable256 &lut, const LUTInfo &info)
+{
+    // Only exponential tables indexed by QASYMM8 or QASYMM8_SIGNED values are supported here.
+    ARM_COMPUTE_ASSERT((info.type == LUTType::Exponential && info.dt == DataType::QASYMM8) ||
+                       (info.type == LUTType::Exponential && info.dt == DataType::QASYMM8_SIGNED));
+
+    for (int i = 0; i < 256; ++i)
+    {
+        const float deq = info.dt == DataType::QASYMM8 ? dequantize_qasymm8(i, info.qinfo)
+                                                       : dequantize_qasymm8_signed(i - 128, info.qinfo);
+        lut[i]          = exponential(deq, info);
+    }
+}
+
+void init_lut(LookupTable65536 &lut, const LUTInfo &info)
 {
     // assert lut is valid config.
     ARM_COMPUTE_ASSERT((info.type == LUTType::Activation && info.dt == DataType::F16) ||
@@ -82,13 +101,13 @@ void init_lut_16bit(LookupTable65536 *lut, const LUTInfo &info)
         {
             case LUTType::Activation:
             {
-                (*lut)[item.i] = activation(item.fp, info);
+                lut[item.i] = activation(item.fp, info);
                 break;
             }
             case LUTType::Exponential:
             {
-                bf16.i         = exponential(item.i, info);
-                (*lut)[item.i] = bf16.fp;
+                bf16.i      = exponential_bf16(item.i, info);
+                lut[item.i] = bf16.fp;
                 break;
             }
             default:
@@ -103,10 +122,24 @@ void init_lut_16bit(LookupTable65536 *lut, const LUTInfo &info)
 
 } // namespace
 
-std::shared_ptr<LookupTable65536> LUTManager::get_lut_table(LUTInfo info)
+template <>
+inline std::map<LUTInfo, std::weak_ptr<LookupTable256>> &LUTManager::get_map<LookupTable256>()
+{
+    return map_fp32;
+}
+
+template <>
+inline std::map<LUTInfo, std::weak_ptr<LookupTable65536>> &LUTManager::get_map<LookupTable65536>()
 {
-    const auto itr   = map_fp16.find(info);
-    auto       s_ptr = (itr != map_fp16.end()) ? itr->second.lock() : nullptr; // nullptr if invalid or not found.
+    return map_fp16;
+}
+
+template <typename T>
+std::shared_ptr<T> LUTManager::get_lut_table(LUTInfo info)
+{
+    auto      &map   = get_map<T>();
+    const auto itr   = map.find(info);
+    auto       s_ptr = (itr != map.end()) ? itr->second.lock() : nullptr; // nullptr if invalid or not found.
     if (s_ptr != nullptr)
     {
         // Found and valid
@@ -116,12 +149,15 @@ std::shared_ptr<LookupTable65536> LUTManager::get_lut_table(LUTInfo info)
     {
         // Not found, or pointer not valid
         // We do not use make_shared to prevent the weak_ptr keeping the control block alive
-        std::shared_ptr<LookupTable65536> ptr(new LookupTable65536);
-        init_lut_16bit(ptr.get(), info);
-        map_fp16[info] = ptr;
+        std::shared_ptr<T> ptr(new T);
+        init_lut(*ptr, info);
+        map[info] = ptr;
         return ptr;
     }
 }
+
+template std::shared_ptr<LookupTable256>   LUTManager::get_lut_table<LookupTable256>(LUTInfo info);
+template std::shared_ptr<LookupTable65536> LUTManager::get_lut_table<LookupTable65536>(LUTInfo info);
 #endif // __aarch64__
 
 // Static function to get LutManager instance
diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h
index 226f44f360..eca9472f41 100644
--- a/src/core/helpers/LUTManager.h
+++ b/src/core/helpers/LUTManager.h
@@ -35,14 +35,14 @@
 namespace arm_compute
 {
 #ifdef __aarch64__
-using LookupTable256   = std::array<qasymm8_t, 256>;
+using LookupTable256   = std::array<float, 256>;
 using LookupTable65536 = std::array<float16_t, 65536>;
 #endif // __aarch64__
 
 enum class LUTType
 {
     Activation,  // Determined by activation type
-    Exponential, // e^x
+    Exponential, // e^(beta * x)
 };
 
 struct LUTInfo
@@ -76,7 +76,7 @@ struct LUTInfo
     ActivationLayerInfo::ActivationFunction act;
     float                                   alpha;
     float                                   beta;
-    DataType                                dt;
+    DataType                                dt; // What datatype the table is indexed with.
     UniformQuantizationInfo                 qinfo;
     LUTType                                 type; // Default is Activation.
 };
@@ -89,9 +89,14 @@ class LUTManager
 
     static LUTManager &get_instance();
 #ifdef __aarch64__
-    std::shared_ptr<LookupTable65536> get_lut_table(LUTInfo info);
+    template <typename T>
+    std::shared_ptr<T> get_lut_table(LUTInfo info);
 
 private:
+    template <typename T>
+    inline std::map<LUTInfo, std::weak_ptr<T>> &get_map();
+
+    std::map<LUTInfo, std::weak_ptr<LookupTable256>>   map_fp32{};
     std::map<LUTInfo, std::weak_ptr<LookupTable65536>> map_fp16{};
 #endif // __aarch64__
 };
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index c02691d5db..08901437a1 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -53,6 +53,12 @@ static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_acti
     ActivationLayerInfo::ActivationFunction::TANH,         ActivationLayerInfo::ActivationFunction::HARD_SWISH,
     ActivationLayerInfo::ActivationFunction::LEAKY_RELU,   ActivationLayerInfo::ActivationFunction::GELU,
 };
+
+/* Static quantization currently supports only ReLU-based activations */
+static const std::array<ActivationLayerInfo::ActivationFunction, 3> qasymm8_static_quant_activations = {
+    ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+
 /* Supported activation in the 16-bit integer domain */
 static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations = {
     ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH,
@@ -72,6 +78,12 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
     const QuantizationInfo &oq_info   = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
     const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
 
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        is_data_type_quantized_asymmetric_char(data_type) && oq_info.is_dynamic() &&
+            (std::find(std::begin(qasymm8_static_quant_activations), std::end(qasymm8_static_quant_activations),
+                       f_act) == std::end(qasymm8_static_quant_activations)),
+        "For QASYMM8 statically quantized, only relu and lower/upper bounded relu are supported");
+
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(
         is_data_type_quantized_asymmetric(data_type) &&
             (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) ==
@@ -114,6 +126,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
 }
 
 #ifdef __aarch64__
+// TODO (COMPMID-7511): delegate to LUTManager
 void init_lut(ActivationLayerInfo::ActivationFunction act_func,
               DataType                                data_type,
               const UniformQuantizationInfo          &qi_in,
@@ -208,6 +221,7 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
     // Initialise lut_manager
     LUTManager &lut_manager = LUTManager::get_instance();
 
+    // TODO (COMPMID-7511): delegate to LUTManager
     if ((src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) &&
         activation_info.activation() != ActivationFunction::RELU)
     {
@@ -223,7 +237,7 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
         // Create info using init list.
         const LUTInfo info = {activation_info.activation(), activation_info.a(), activation_info.b(), src->data_type(),
                               src->quantization_info().uniform()};
-        activation_info.setLookupTable65536((lut_manager.get_lut_table(info)));
+        activation_info.setLookupTable65536((lut_manager.get_lut_table<LookupTable65536>(info)));
     }
 #endif // __aarch64__
     _act_info = activation_info;
diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp
index 05c7742b03..b4d44cb5bc 100644
--- a/src/cpu/kernels/CpuCastKernel.cpp
+++ b/src/cpu/kernels/CpuCastKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2023 Arm Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -893,7 +893,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
                 case DataType::QASYMM8:
                 case DataType::U8:
                 {
-                    /* Down-conversion F32 -> U8 */
+                    /* Down-conversion F32 -> QASYMM8, U8 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
@@ -922,7 +922,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
-                                *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+                                *(dst_ptr + x) = utils::cast::saturate_static_cast<uint8_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
@@ -958,7 +958,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
-                                *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+                                *(dst_ptr + x) = utils::cast::saturate_static_cast<int8_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp
index 39ba764c78..17e455b3e3 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.cpp
+++ b/src/cpu/kernels/CpuIm2ColKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2023 Arm Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -306,7 +306,7 @@ void CpuIm2ColKernel::configure(const ITensorInfo   *src,
     _kernel_height   = kernel_dims.height;
     _input_pad_right = input_pad_right;
     _dilation        = dilation;
-    _convolved_dims  = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width,
+    _convolved_dims  = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), _kernel_width,
                                          _kernel_height, _conv_info, _dilation);
     _has_bias        = has_bias;
 
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index b7e395fb79..b9eb0fcb20 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -34,6 +34,7 @@
 #include "src/core/common/Registrars.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/LUTManager.h"
 #include "src/core/helpers/Utils.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/softmax/list.h"
@@ -51,6 +52,14 @@ namespace
 
 /* Softmax */
 static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_kernels = {
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+    {"sve_bf16_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data)
+     { return (!data.is_log && data.dt == DataType::BFLOAT16 && data.isa.sve && data.axis == 0); },
+     REGISTER_BF16_SVE(sve_softmax_bf16)},
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_BF16)
     {"sme2_fp32_softmax",
      [](const SoftmaxKernelDataTypeISASelectorData &data)
      { return (!data.is_log && data.dt == DataType::F32 && data.isa.sme2 && data.axis == 0); },
@@ -103,28 +112,6 @@ static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_ker
      REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<true>)},
 };
 
-void init_lut(std::vector<float> &lut, DataType type, float scale, float beta)
-{
-    if (type == DataType::QASYMM8)
-    {
-        for (int i = 0; i < 256; ++i)
-        {
-            lut.push_back(std::exp(-scale * beta * i));
-        }
-    }
-    else if (type == DataType::QASYMM8_SIGNED)
-    {
-        for (int i = -128; i < 128; ++i)
-        {
-            lut.push_back(std::exp(-scale * beta * i));
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Invalid datatype for QASYMM8/QASYMM8_SIGNED softmax");
-    }
-}
-
 Status validate_arguments_softmax(
     const ITensorInfo &src, const ITensorInfo &dst, float beta, int axis, const ITensorInfo &tmp, bool is_log)
 {
@@ -132,7 +119,7 @@ Status validate_arguments_softmax(
     // Check input
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::F16, DataType::F32);
+                                                         DataType::F16, DataType::F32, DataType::BFLOAT16);
 
     ARM_COMPUTE_RETURN_ERROR_ON(axis < 0 || axis > 3);
 
@@ -195,7 +182,7 @@ void CpuSoftmaxKernel::configure(
     }
 
     const auto *uk = CpuSoftmaxKernel::get_implementation(SoftmaxKernelDataTypeISASelectorData{
-        src->data_type(), CPUInfo::get().get_isa(), is_log, axis, CPUInfo::get().get_sme2_vector_length()});
+        src->data_type(), CPUInfo::get().get_isa(), is_log, axis, CPUInfo::get().get_sme2_vector_length_in_bits()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel");
@@ -232,12 +219,27 @@ void CpuSoftmaxKernel::configure(
 
     ICpuKernel<CpuSoftmaxKernel>::configure(win);
 
+#ifdef __aarch64__
     const std::string uk_name = uk->name;
+
+    if (src->data_type() == DataType::BFLOAT16)
+    {
+        LUTManager &lutmanager = LUTManager::get_instance();
+        LUTInfo     info       = {LUTType::Exponential, beta, DataType::BFLOAT16, UniformQuantizationInfo()};
+        _lut_bf16              = lutmanager.get_lut_table<LookupTable65536>(info);
+    }
+
     if (uk_name == "sme2_qu8_softmax_lut_512VL" || uk_name == "sme2_qs8_softmax_lut_512VL")
     {
-        const float scale = src->quantization_info().uniform().scale;
-        init_lut(_lut, src->data_type(), scale, beta);
+        UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+        // The ukernel looks up exp(-beta * deq(q)), so the table is built with
+        // a negated beta. The quantization offset cancels out in softmax, so
+        // it is zeroed here and left out of the LUT.
+        qinfo.offset = 0;
+        const LUTInfo info{LUTType::Exponential, -beta, src->data_type(), qinfo};
+        _lut = LUTManager::get_instance().get_lut_table<LookupTable256>(info);
     }
+#endif // __aarch64__
 }
 
 Status CpuSoftmaxKernel::validate(
@@ -274,11 +276,24 @@ void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const
         const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
 
         void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
-        _run_method(src, tmp_for_thread, dst, _beta, _axis, window, _lut.data());
+#ifdef __aarch64__
+        if (_lut)
+        {
+            _run_method(src, tmp_for_thread, dst, _beta, _axis, window, _lut->data());
+        }
+        else
+#endif // __aarch64__
+        {
+            _run_method(src, tmp_for_thread, dst, _beta, _axis, window, nullptr);
+        }
     }
     else
     {
+#ifdef __aarch64__
+        _run_method(src, nullptr, dst, _beta, _axis, window, _lut_bf16.get());
+#else  // __aarch64__
         _run_method(src, nullptr, dst, _beta, _axis, window, nullptr);
+#endif // __aarch64__
     }
 }
 
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h
index becaa42835..c297d37f3f 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.h
+++ b/src/cpu/kernels/CpuSoftmaxKernel.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H
 
 #include "src/core/common/Macros.h"
+#include "src/core/helpers/LUTManager.h"
 #include "src/cpu/ICpuKernel.h"
 
 namespace arm_compute
@@ -78,11 +79,14 @@ class CpuSoftmaxKernel : public ICpuKernel<CpuSoftmaxKernel>
     static const std::vector<SoftmaxKernel> &get_available_kernels();
 
 private:
-    float              _beta{1.0f};
-    SoftmaxKernelPtr   _run_method{nullptr};
-    std::string        _name{};
-    int                _axis{};
-    std::vector<float> _lut = {};
+    float            _beta{1.0f};
+    SoftmaxKernelPtr _run_method{nullptr};
+    std::string      _name{};
+    int              _axis{};
+#ifdef __aarch64__
+    std::shared_ptr<LookupTable256>   _lut{nullptr};
+    std::shared_ptr<LookupTable65536> _lut_bf16 = nullptr;
+#endif // __aarch64__
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp
index 2897f4b242..c331d0bf02 100644
--- a/src/cpu/kernels/cast/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2023 Arm Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -224,7 +224,7 @@ void neon_fp16_to_other_dt_cast(
                     // Compute left-over elements
                     for (; x < window_end_x; ++x)
                     {
-                        *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+                        *(dst_ptr + x) = utils::cast::saturate_static_cast<int8_t>(*(src_ptr + x));
                     }
                 },
                 src, dst);
@@ -256,7 +256,7 @@ void neon_fp16_to_other_dt_cast(
                     // Compute left-over elements
                     for (; x < window_end_x; ++x)
                     {
-                        *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+                        *(dst_ptr + x) = utils::cast::saturate_static_cast<uint8_t>(*(src_ptr + x));
                     }
                 },
                 src, dst);
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
index 78e3baf74b..1560b38ceb 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
@@ -24,6 +24,8 @@
 #ifndef ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H
 #define ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H
 
+#include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/NEAsymm.h"
 
 namespace arm_compute
@@ -567,7 +569,7 @@ inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &
     return out;
 }
 
-inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
+inline float32x4x4_t load_quantized(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
 {
     qasymm8x16_signed_t x   = vld1q_s8(input1_ptr);
     const float32x4x4_t out = {{
@@ -596,11 +598,14 @@ inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
 inline void
 store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
 {
-    int32x4x4_t out = {{
-        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
-        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
-        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
-        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+    // vcvtq_s32_f32 truncates toward zero: pre-adding 0.5 rounds non-negative results to nearest.
+    const float32x4_t adj_offset = vaddq_f32(offset, vdupq_n_f32(0.5f));
+
+    const int32x4x4_t out = {{
+        vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[0], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[1], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[2], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(adj_offset, rf.val[3], invscale)),
     }};
     store_quantized(output_ptr, out);
 }
@@ -612,31 +617,48 @@ inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
     vst1q_s8(output_ptr, vcombine_s8(pa, pb));
 }
 
-inline void store_quantized_signed(int8_t              *output_ptr,
-                                   const float32x4x4_t &rf,
-                                   const float32x4_t   &offset,
-                                   const float32x4_t   &invscale)
-{
-    int32x4x4_t out = {{
-        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
-        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
-        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
-        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+inline void
+store_quantized(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+{
+    // vcvtq_s32_f32 truncates toward zero: bias the offset by -0.5 where rf < 0 and by +0.5 elsewhere.
+    const uint32x4x4_t cmp = {{
+#ifdef __aarch64__
+        vcltzq_f32(rf.val[0]),
+        vcltzq_f32(rf.val[1]),
+        vcltzq_f32(rf.val[2]),
+        vcltzq_f32(rf.val[3]),
+#else  // __aarch64__
+        vcltq_f32(rf.val[0], vdupq_n_f32(0.0f)),
+        vcltq_f32(rf.val[1], vdupq_n_f32(0.0f)),
+        vcltq_f32(rf.val[2], vdupq_n_f32(0.0f)),
+        vcltq_f32(rf.val[3], vdupq_n_f32(0.0f)),
+#endif // __aarch64__
+    }}; // NOTE(review): sign comes from rf, not from offset + rf * invscale; confirm rounding when the two differ.
+    const float32x4_t   neg_point_5 = vdupq_n_f32(-0.5f);
+    const float32x4_t   pos_point_5 = vdupq_n_f32(0.5f);
+    const float32x4x4_t adj_offset  = {{
+         vaddq_f32(offset, vbslq_f32(cmp.val[0], neg_point_5, pos_point_5)),
+         vaddq_f32(offset, vbslq_f32(cmp.val[1], neg_point_5, pos_point_5)),
+         vaddq_f32(offset, vbslq_f32(cmp.val[2], neg_point_5, pos_point_5)),
+         vaddq_f32(offset, vbslq_f32(cmp.val[3], neg_point_5, pos_point_5)),
     }};
-    store_quantized_signed(output_ptr, out);
-}
 
-template <ArithmeticOperation op>
-inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
-{
-    return quantize_qasymm8(elementwise_arithm_op_scalar<op>(a, b), qinfo);
+    const int32x4x4_t out = {{
+        vcvtq_s32_f32(vmlaq_f32(adj_offset.val[0], rf.val[0], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(adj_offset.val[1], rf.val[1], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(adj_offset.val[2], rf.val[2], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(adj_offset.val[3], rf.val[3], invscale)),
+    }};
+    store_quantized_signed(output_ptr, out);
 }
 
-template <ArithmeticOperation op>
-inline int8_t
-elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
+template <ArithmeticOperation op,
+          typename Output, // quantized result type; the enable_if below admits only int8_t and uint8_t
+          typename = std::enable_if_t<std::is_same<Output, int8_t>::value || std::is_same<Output, uint8_t>::value>>
+inline Output elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
 {
-    return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);
+    const float res = elementwise_arithm_op_scalar<op>(a, b); // apply op in float, then quantize with qinfo
+    return Qasymm8QuantizationHelper<Output>::quantize(res, qinfo);
 }
 
 template <ArithmeticOperation op>
@@ -669,19 +691,23 @@ inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x
     return out;
 }
 
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_loop(int            window_start_x,
-                                                int            window_end_x,
-                                                int            window_step_x,
-                                                const uint8_t *input1_ptr,
-                                                const uint8_t *input2_ptr,
-                                                uint8_t       *output_ptr,
-                                                int32x4_t      voffset1,
-                                                int32x4_t      voffset2,
-                                                float32x4_t    vscale1,
-                                                float32x4_t    vscale2,
-                                                float32x4_t    voffseto,
-                                                float32x4_t    invvscaleo)
+template <ArithmeticOperation op,
+          typename Input,
+          typename Output,
+          typename = std::enable_if_t<(std::is_same<Input, int8_t>::value || std::is_same<Input, uint8_t>::value) &&
+                                      (std::is_same<Output, int8_t>::value || std::is_same<Output, uint8_t>::value)>>
+inline int elementwise_arithm_op_quantized_loop(int          window_start_x,
+                                                int          window_end_x,
+                                                int          window_step_x,
+                                                const Input *input1_ptr,
+                                                const Input *input2_ptr,
+                                                Output      *output_ptr,
+                                                int32x4_t    voffset1,
+                                                int32x4_t    voffset2,
+                                                float32x4_t  vscale1,
+                                                float32x4_t  vscale2,
+                                                float32x4_t  voffseto,
+                                                float32x4_t  invvscaleo)
 {
     int x = window_start_x;
     for (; x <= (window_end_x - window_step_x); x += window_step_x)
@@ -695,44 +721,22 @@ inline int elementwise_arithm_op_quantized_loop(int            window_start_x,
     return x;
 }
 
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_singed_loop(int           window_start_x,
-                                                       int           window_end_x,
-                                                       int           window_step_x,
-                                                       const int8_t *input1_ptr,
-                                                       const int8_t *input2_ptr,
-                                                       int8_t       *output_ptr,
-                                                       int32x4_t     voffset1,
-                                                       int32x4_t     voffset2,
-                                                       float32x4_t   vscale1,
-                                                       float32x4_t   vscale2,
-                                                       float32x4_t   voffseto,
-                                                       float32x4_t   invvscaleo)
-{
-    int x = window_start_x;
-    for (; x <= (window_end_x - window_step_x); x += window_step_x)
-    {
-        // Get inputs and compute output
-        const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
-        const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
-        const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
-        store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
-    }
-    return x;
-}
-
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_broadcast_loop(int            window_start_x,
-                                                          int            window_end_x,
-                                                          int            window_step_x,
-                                                          const uint8_t *non_broadcast_input_ptr,
-                                                          float32x4x4_t  broadcast_vector,
-                                                          uint8_t       *output_ptr,
-                                                          int32x4_t      voffset_non_broadcast,
-                                                          float32x4_t    vscale_non_broadcast,
-                                                          float32x4_t    voffseto,
-                                                          float32x4_t    invvscaleo,
-                                                          bool           reorder)
+template <ArithmeticOperation op,
+          typename Input,
+          typename Output,
+          typename = std::enable_if_t<(std::is_same<Input, int8_t>::value || std::is_same<Input, uint8_t>::value) &&
+                                      (std::is_same<Output, int8_t>::value || std::is_same<Output, uint8_t>::value)>>
+inline int elementwise_arithm_op_quantized_broadcast_loop(int           window_start_x,
+                                                          int           window_end_x,
+                                                          int           window_step_x,
+                                                          const Input  *non_broadcast_input_ptr,
+                                                          float32x4x4_t broadcast_vector,
+                                                          Output       *output_ptr,
+                                                          int32x4_t     voffset_non_broadcast,
+                                                          float32x4_t   vscale_non_broadcast,
+                                                          float32x4_t   voffseto,
+                                                          float32x4_t   invvscaleo,
+                                                          bool          reorder)
 {
     int x = window_start_x;
     for (; x <= (window_end_x - window_step_x); x += window_step_x)
@@ -745,44 +749,22 @@ inline int elementwise_arithm_op_quantized_broadcast_loop(int            window_
     }
     return x;
 }
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int           window_start_x,
-                                                                 int           window_end_x,
-                                                                 int           window_step_x,
-                                                                 const int8_t *non_broadcast_input_ptr,
-                                                                 float32x4x4_t broadcast_vector,
-                                                                 int8_t       *output_ptr,
-                                                                 int32x4_t     voffset_non_broadcast,
-                                                                 float32x4_t   vscale_non_broadcast,
-                                                                 float32x4_t   voffseto,
-                                                                 float32x4_t   invvscaleo,
-                                                                 bool          reorder)
-{
-    int x = window_start_x;
-    for (; x <= (window_end_x - window_step_x); x += window_step_x)
-    {
-        const float32x4x4_t af =
-            load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
-        const float32x4x4_t rf =
-            elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
-        store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
-    }
-    return x;
-}
 
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_loop(int            window_start_x,
-                                              int            window_end_x,
-                                              int            window_step_x,
-                                              const uint8_t *input1_ptr,
-                                              const uint8_t *input2_ptr,
-                                              uint8_t       *output_ptr,
-                                              int32x4_t      voffset1,
-                                              int32x4_t      voffset2,
-                                              float32x4_t    vscale1,
-                                              float32x4_t    vscale2,
-                                              float32x4_t    voffseto,
-                                              float32x4_t    invvscaleo)
+template <ComparisonOperation op,
+          typename Input,
+          typename = std::enable_if_t<std::is_same<Input, int8_t>::value || std::is_same<Input, uint8_t>::value>>
+inline int elementwise_comp_op_quantized_loop(int          window_start_x,
+                                              int          window_end_x,
+                                              int          window_step_x,
+                                              const Input *input1_ptr,
+                                              const Input *input2_ptr,
+                                              uint8_t     *output_ptr,
+                                              int32x4_t    voffset1,
+                                              int32x4_t    voffset2,
+                                              float32x4_t  vscale1,
+                                              float32x4_t  vscale2,
+                                              float32x4_t  voffseto,
+                                              float32x4_t  invvscaleo)
 {
     ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
     int x = window_start_x;
@@ -796,44 +778,20 @@ inline int elementwise_comp_op_quantized_loop(int            window_start_x,
     return x;
 }
 
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_loop(int           window_start_x,
-                                                     int           window_end_x,
-                                                     int           window_step_x,
-                                                     const int8_t *input1_ptr,
-                                                     const int8_t *input2_ptr,
-                                                     uint8_t      *output_ptr,
-                                                     int32x4_t     voffset1,
-                                                     int32x4_t     voffset2,
-                                                     float32x4_t   vscale1,
-                                                     float32x4_t   vscale2,
-                                                     float32x4_t   voffseto,
-                                                     float32x4_t   invvscaleo)
-{
-    ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
-    int x = window_start_x;
-    for (; x <= (window_end_x - window_step_x); x += window_step_x)
-    {
-        const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
-        const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
-        const uint32x4x4_t  rf = elementwise_comp_op<op>(af, bf);
-        store_quantized(output_ptr + x, rf);
-    }
-    return x;
-}
-
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_broadcast_loop(int            window_start_x,
-                                                        int            window_end_x,
-                                                        int            window_step_x,
-                                                        const uint8_t *non_broadcast_input_ptr,
-                                                        float32x4x4_t  broadcast_vector,
-                                                        uint8_t       *output_ptr,
-                                                        int32x4_t      voffset_non_broadcast,
-                                                        float32x4_t    vscale_non_broadcast,
-                                                        float32x4_t    voffseto,
-                                                        float32x4_t    invvscaleo,
-                                                        bool           reorder)
+template <ComparisonOperation op,
+          typename Input,
+          typename = std::enable_if_t<std::is_same<Input, int8_t>::value || std::is_same<Input, uint8_t>::value>>
+inline int elementwise_comp_op_quantized_broadcast_loop(int           window_start_x,
+                                                        int           window_end_x,
+                                                        int           window_step_x,
+                                                        const Input  *non_broadcast_input_ptr,
+                                                        float32x4x4_t broadcast_vector,
+                                                        uint8_t      *output_ptr,
+                                                        int32x4_t     voffset_non_broadcast,
+                                                        float32x4_t   vscale_non_broadcast,
+                                                        float32x4_t   voffseto,
+                                                        float32x4_t   invvscaleo,
+                                                        bool          reorder)
 {
     ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
     int x = window_start_x;
@@ -848,43 +806,21 @@ inline int elementwise_comp_op_quantized_broadcast_loop(int            window_st
     return x;
 }
 
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_broadcast_loop(int           window_start_x,
-                                                               int           window_end_x,
-                                                               int           window_step_x,
-                                                               const int8_t *non_broadcast_input_ptr,
-                                                               float32x4x4_t broadcast_vector,
-                                                               uint8_t      *output_ptr,
-                                                               int32x4_t     voffset_non_broadcast,
-                                                               float32x4_t   vscale_non_broadcast,
-                                                               float32x4_t   voffseto,
-                                                               float32x4_t   invvscaleo,
-                                                               bool          reorder)
-{
-    ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
-    int x = window_start_x;
-    for (; x <= (window_end_x - window_step_x); x += window_step_x)
-    {
-        const float32x4x4_t af =
-            load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
-        const uint32x4x4_t rf =
-            elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
-        store_quantized(output_ptr + x, rf);
-    }
-    return x;
-}
-
+template <typename Input,
+          typename Output,
+          typename = std::enable_if_t<(std::is_same<Input, int8_t>::value || std::is_same<Input, uint8_t>::value) &&
+                                      (std::is_same<Output, int8_t>::value || std::is_same<Output, uint8_t>::value)>>
 inline void elementwise_op_quantized(const ITensor *in1,
                                      const ITensor *in2,
                                      ITensor       *out,
                                      const Window  &window,
-                                     uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+                                     Output (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
                                      int (*broadcast_func)(int,
                                                            int,
                                                            int,
-                                                           const uint8_t *,
+                                                           const Input *,
                                                            float32x4x4_t,
-                                                           uint8_t *,
+                                                           Output *,
                                                            int32x4_t,
                                                            float32x4_t,
                                                            float32x4_t,
@@ -893,9 +829,9 @@ inline void elementwise_op_quantized(const ITensor *in1,
                                      int (*neon_func)(int,
                                                       int,
                                                       int,
-                                                      const uint8_t *,
-                                                      const uint8_t *,
-                                                      uint8_t *,
+                                                      const Input *,
+                                                      const Input *,
+                                                      Output *,
                                                       int32x4_t,
                                                       int32x4_t,
                                                       float32x4_t,
@@ -903,277 +839,8 @@ inline void elementwise_op_quantized(const ITensor *in1,
                                                       float32x4_t,
                                                       float32x4_t))
 {
-    // Create input windows
-    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    const int  window_step_x         = 16;
-    const auto window_start_x        = static_cast<int>(window.x().start());
-    const auto window_end_x          = static_cast<int>(window.x().end());
-    const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
-    const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
-
-    // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero)
-    const float32x4_t voffseto   = vdupq_n_f32(output_qinfo.offset + 0.5f);
-    const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
-
-    if (is_broadcast_across_x)
-    {
-        // Select the broadcast input on the X axis
-        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
-        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
-        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
-        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
-        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
-        const UniformQuantizationInfo broadcast_qinfo     = broadcast_tensor->info()->quantization_info().uniform();
-        const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
-        const int32x4_t   voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
-        const float32x4_t vscale_non_broadcast  = vdupq_n_f32(non_broadcast_qinfo.scale);
-
-        // Clear X Dimension on execution window as we handle manually
-        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        Iterator broadcast_input(broadcast_tensor, broadcast_win);
-        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
-        Iterator output(out, win);
-
-        execute_window_loop(
-            win,
-            [&](const Coordinates &)
-            {
-                const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
-                const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
-
-                const uint8_t       broadcast_value  = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
-                const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
-
-                int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
-                                          broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
-                                          voffseto, invvscaleo, !is_broadcast_input_2);
-                for (; x < window_end_x; ++x)
-                {
-                    const float afs   = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
-                    const float bfs   = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
-                    *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
-                                                       !is_broadcast_input_2 ? afs : bfs, output_qinfo);
-                }
-            },
-            broadcast_input, non_broadcast_input, output);
-    }
-    else
-    {
-        const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
-        const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
-
-        // Input1 quantization info
-        const int32x4_t   voffset1 = vdupq_n_s32(input1_qinfo.offset);
-        const float32x4_t vscale1  = vdupq_n_f32(input1_qinfo.scale);
-
-        // Input2 quantization info
-        const int32x4_t   voffset2 = vdupq_n_s32(input2_qinfo.offset);
-        const float32x4_t vscale2  = vdupq_n_f32(input2_qinfo.scale);
-
-        // Clear X Dimension on execution window as we handle manually
-        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        Iterator input1(in1, input1_win);
-        Iterator input2(in2, input2_win);
-        Iterator output(out, win);
-
-        execute_window_loop(
-            win,
-            [&](const Coordinates &)
-            {
-                const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-                const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-                const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
-                int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
-                                     voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
-                for (; x < window_end_x; ++x)
-                {
-                    const float afs   = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
-                    const float bfs   = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
-                    *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
-                }
-            },
-            input1, input2, output);
-    }
-}
-
-inline void
-elementwise_comp_quantized_signed(const ITensor *in1,
-                                  const ITensor *in2,
-                                  ITensor       *out,
-                                  const Window  &window,
-                                  uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
-                                  int (*broadcast_func)(int,
-                                                        int,
-                                                        int,
-                                                        const int8_t *,
-                                                        float32x4x4_t,
-                                                        uint8_t *,
-                                                        int32x4_t,
-                                                        float32x4_t,
-                                                        float32x4_t,
-                                                        float32x4_t,
-                                                        const bool),
-                                  int (*neon_func)(int,
-                                                   int,
-                                                   int,
-                                                   const int8_t *,
-                                                   const int8_t *,
-                                                   uint8_t *,
-                                                   int32x4_t,
-                                                   int32x4_t,
-                                                   float32x4_t,
-                                                   float32x4_t,
-                                                   float32x4_t,
-                                                   float32x4_t))
-{
-    // Create input windows
-    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    Window win = window;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    const int  window_step_x         = 16;
-    const auto window_start_x        = static_cast<int>(window.x().start());
-    const auto window_end_x          = static_cast<int>(window.x().end());
-    const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
-    const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
-
-    const float32x4_t voffseto   = vdupq_n_f32(output_qinfo.offset);
-    const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
-
-    if (is_broadcast_across_x)
-    {
-        // Select the broadcast input on the X axis
-        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
-        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
-        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
-        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
-        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
-        const UniformQuantizationInfo broadcast_qinfo     = broadcast_tensor->info()->quantization_info().uniform();
-        const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
-        const int32x4_t   voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
-        const float32x4_t vscale_non_broadcast  = vdupq_n_f32(non_broadcast_qinfo.scale);
-
-        // Clear X Dimension on execution window as we handle manually
-        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        Iterator broadcast_input(broadcast_tensor, broadcast_win);
-        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
-        Iterator output(out, win);
-
-        execute_window_loop(
-            win,
-            [&](const Coordinates &)
-            {
-                const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
-                const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
-
-                const int8_t        broadcast_value  = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
-                const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
-
-                int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
-                                          broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
-                                          voffseto, invvscaleo, !is_broadcast_input_2);
-                for (; x < window_end_x; ++x)
-                {
-                    const float afs   = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
-                    const float bfs   = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
-                    *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
-                                                       !is_broadcast_input_2 ? afs : bfs, output_qinfo);
-                }
-            },
-            broadcast_input, non_broadcast_input, output);
-    }
-    else
-    {
-        const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
-        const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
-
-        // Input1 quantization info
-        const int32x4_t   voffset1 = vdupq_n_s32(input1_qinfo.offset);
-        const float32x4_t vscale1  = vdupq_n_f32(input1_qinfo.scale);
-
-        // Input2 quantization info
-        const int32x4_t   voffset2 = vdupq_n_s32(input2_qinfo.offset);
-        const float32x4_t vscale2  = vdupq_n_f32(input2_qinfo.scale);
+    using InputVector = wrapper::traits::neon_vector_t<Input, 16>;
 
-        // Clear X Dimension on execution window as we handle manually
-        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-        Iterator input1(in1, input1_win);
-        Iterator input2(in2, input2_win);
-        Iterator output(out, win);
-
-        execute_window_loop(
-            win,
-            [&](const Coordinates &)
-            {
-                const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
-                const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
-                const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
-                int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
-                                     voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
-                for (; x < window_end_x; ++x)
-                {
-                    const float afs   = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
-                    const float bfs   = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
-                    *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
-                }
-            },
-            input1, input2, output);
-    }
-}
-
-inline void
-elementwise_op_quantized_signed(const ITensor *in1,
-                                const ITensor *in2,
-                                ITensor       *out,
-                                const Window  &window,
-                                int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
-                                int (*broadcast_func)(int,
-                                                      int,
-                                                      int,
-                                                      const int8_t *,
-                                                      float32x4x4_t,
-                                                      int8_t *,
-                                                      int32x4_t,
-                                                      float32x4_t,
-                                                      float32x4_t,
-                                                      float32x4_t,
-                                                      const bool),
-                                int (*neon_func)(int,
-                                                 int,
-                                                 int,
-                                                 const int8_t *,
-                                                 const int8_t *,
-                                                 int8_t *,
-                                                 int32x4_t,
-                                                 int32x4_t,
-                                                 float32x4_t,
-                                                 float32x4_t,
-                                                 float32x4_t,
-                                                 float32x4_t))
-{
     // Create input windows
     Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
     Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
@@ -1218,19 +885,22 @@ elementwise_op_quantized_signed(const ITensor *in1,
             win,
             [&](const Coordinates &)
             {
-                const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
-                const auto output_ptr              = reinterpret_cast<int8_t *>(output.ptr());
+                const auto non_broadcast_input_ptr = reinterpret_cast<const Input *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<Output *>(output.ptr());
 
-                const int8_t        broadcast_value  = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
-                const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
+                const Input       broadcast_value = *reinterpret_cast<const Input *>(broadcast_input.ptr());
+                const InputVector broadcast_value_v =
+                    wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag{});
+                const float32x4x4_t broadcast_vector = vdequantize(broadcast_value_v, broadcast_qinfo);
 
                 int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
                                           broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
                                           voffseto, invvscaleo, !is_broadcast_input_2);
                 for (; x < window_end_x; ++x)
                 {
-                    const float afs   = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
-                    const float bfs   = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
+                    const float afs   = Qasymm8QuantizationHelper<Input>::dequantize(*(non_broadcast_input_ptr + x),
+                                                                                     non_broadcast_qinfo);
+                    const float bfs   = Qasymm8QuantizationHelper<Input>::dequantize(broadcast_value, broadcast_qinfo);
                     *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
                                                        !is_broadcast_input_2 ? afs : bfs, output_qinfo);
                 }
@@ -1262,16 +932,16 @@ elementwise_op_quantized_signed(const ITensor *in1,
             win,
             [&](const Coordinates &)
             {
-                const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
-                const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
-                const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+                const auto input1_ptr = reinterpret_cast<const Input *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const Input *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<Output *>(output.ptr());
 
                 int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
                                      voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
                 for (; x < window_end_x; ++x)
                 {
-                    const float afs   = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
-                    const float bfs   = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
+                    const float afs   = Qasymm8QuantizationHelper<Input>::dequantize(*(input1_ptr + x), input1_qinfo);
+                    const float bfs   = Qasymm8QuantizationHelper<Input>::dequantize(*(input2_ptr + x), input2_qinfo);
                     *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
                 }
             },
@@ -1282,33 +952,34 @@ elementwise_op_quantized_signed(const ITensor *in1,
 template <ArithmeticOperation op>
 void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>,
-                             &elementwise_arithm_op_quantized_broadcast_loop<op>,
-                             &elementwise_arithm_op_quantized_loop<op>);
+    elementwise_op_quantized<uint8_t, uint8_t>(in1, in2, out, window,
+                                               &elementwise_arithm_op_quantized_scalar<op, uint8_t>,
+                                               &elementwise_arithm_op_quantized_broadcast_loop<op, uint8_t, uint8_t>,
+                                               &elementwise_arithm_op_quantized_loop<op, uint8_t, uint8_t>);
 }
 
 template <ArithmeticOperation op>
 void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar<op>,
-                                    &elementwise_arithm_op_quantized_signed_broadcast_loop<op>,
-                                    &elementwise_arithm_op_quantized_singed_loop<op>);
+    elementwise_op_quantized<int8_t, int8_t>(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op, int8_t>,
+                                             &elementwise_arithm_op_quantized_broadcast_loop<op, int8_t, int8_t>,
+                                             &elementwise_arithm_op_quantized_loop<op, int8_t, int8_t>);
 }
 
 template <ComparisonOperation op>
 void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
-                             &elementwise_comp_op_quantized_broadcast_loop<op>,
-                             &elementwise_comp_op_quantized_loop<op>);
+    elementwise_op_quantized<uint8_t, uint8_t>(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
+                                               &elementwise_comp_op_quantized_broadcast_loop<op, uint8_t>,
+                                               &elementwise_comp_op_quantized_loop<op, uint8_t>);
 }
 
 template <ComparisonOperation op>
 void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
-                                      &elementwise_comp_op_quantized_signed_broadcast_loop<op>,
-                                      &elementwise_comp_op_quantized_signed_loop<op>);
+    elementwise_op_quantized<int8_t, uint8_t>(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
+                                              &elementwise_comp_op_quantized_broadcast_loop<op, int8_t>,
+                                              &elementwise_comp_op_quantized_loop<op, int8_t>);
 }
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/quantize/generic/neon/impl.h b/src/cpu/kernels/quantize/generic/neon/impl.h
index 9954a7645e..ba7865cf43 100644
--- a/src/cpu/kernels/quantize/generic/neon/impl.h
+++ b/src/cpu/kernels/quantize/generic/neon/impl.h
@@ -77,6 +77,21 @@ inline vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, co
     return vquantize_signed(qv, qi);
 }
 
+template <typename quantized_type>
+inline vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformRequantizationInfo &qi);
+
+template <>
+inline vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformRequantizationInfo &qi)
+{
+    return vquantize(qv, qi);
+}
+
+template <>
+inline vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformRequantizationInfo &qi)
+{
+    return vquantize_signed(qv, qi);
+}
+
 template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type>
 inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
 {
@@ -239,12 +254,17 @@ void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
+    constexpr bool is_8bit_int = std::is_same<TIn, int8_t>::value || std::is_same<TIn, uint8_t>::value;
+
     const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
     UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    if (is_data_type_quantized_asymmetric(src->info()->data_type()))
+    UniformRequantizationInfo     reqinfo(1.f, 0);
+
+    if (is_8bit_int)
     {
-        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+        reqinfo = compute_requantization_scale_float_offset(uqinfo_in, uqinfo);
     }
+
 #ifdef __aarch64__
     constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
 #else  //__aarch64__
@@ -267,12 +287,26 @@ void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window
             int x = window_start_x;
             for (; x <= (window_end_x - window_step); x += window_step)
             {
-                wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+                if (is_8bit_int)
+                {
+                    wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), reqinfo));
+                }
+                else
+                {
+                    wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+                }
             }
             // Compute left-over elements
             for (; x < window_end_x; ++x)
             {
-                output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
+                if (is_8bit_int)
+                {
+                    output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], reqinfo, rounding_policy);
+                }
+                else
+                {
+                    output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
+                }
             }
         },
         input, output);
diff --git a/src/cpu/kernels/reduction_layer/generic/neon/impl.h b/src/cpu/kernels/reduction_layer/generic/neon/impl.h
index 3fa821d3a4..0c4a7c70c0 100644
--- a/src/cpu/kernels/reduction_layer/generic/neon/impl.h
+++ b/src/cpu/kernels/reduction_layer/generic/neon/impl.h
@@ -873,8 +873,7 @@ struct RedOpX_quantized
 
                         if (op == ReductionOperation::MEAN_SUM)
                         {
-                            const int32_t resFinal = A * (static_cast<float>(res)) + B;
-
+                            const float resFinal                 = A * (static_cast<float>(res)) + B;
                             *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
                         }
                         else
@@ -1427,10 +1426,10 @@ struct RedOpYZW_quantized
                             vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
 
 #ifdef __aarch64__
-                            vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
-                            vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
-                            vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
-                            vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
+                            vec_res_value1 = wrapper::vcvtn<PromotedType>(vec_res_value1_f);
+                            vec_res_value2 = wrapper::vcvtn<PromotedType>(vec_res_value2_f);
+                            vec_res_value3 = wrapper::vcvtn<PromotedType>(vec_res_value3_f);
+                            vec_res_value4 = wrapper::vcvtn<PromotedType>(vec_res_value4_f);
 #else  // defined(__aarch64__)
                             vec_res_value1    = wrapper::vcvt<PromotedType>(vec_res_value1_f);
                             vec_res_value2    = wrapper::vcvt<PromotedType>(vec_res_value2_f);
@@ -1584,8 +1583,8 @@ struct RedOpYZW_quantized
                         {
                         // Apply previously calculated coefficients (with rounding on aarch64)
 #ifdef __aarch64__
-                            const int32_t res =
-                                arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
+                            const int32_t res = arm_compute::round(A * (static_cast<float>(res_value_q)) + B,
+                                                                   RoundingPolicy::TO_NEAREST_EVEN);
 #else  // defined(__aarch64__)
                             const int32_t res = A * (static_cast<float>(res_value_q)) + B;
 #endif // __aarch64__
diff --git a/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp
index f3d443f9aa..f449b4b9b8 100644
--- a/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp
@@ -85,7 +85,7 @@ void sme2_qasymm8_softmax_kernel_512VL( //
             //   * pn9: all-true for 32 bit values
             //   * pn8: all-true for 8-bit values
             //
-            //   * z0-z15 the 256 LUT values of exp(-scale*beta*x) for x in QASYMM8, stored as FP32 values
+            //   * z0-z11, z20-z23: the 256 LUT values of exp(-scale*beta*x) for x in QASYMM8, stored as FP32 values
 
             // Prepares all constant values
 
@@ -115,8 +115,12 @@ void sme2_qasymm8_softmax_kernel_512VL( //
             add x2, x2, #256
             .inst 0xa040c448 //ld1w    { z8.s - z11.s }, pn9/z, [x2]
             add x2, x2, #256
-            .inst 0xa040c44c //ld1w    { z12.s - z15.s }, pn9/z, [x2]
+            .inst 0xa040c454 //ld1w    { z20.s - z23.s }, pn9/z, [x2]
 
+            dup z24.b, #0
+            dup z25.b, #0
+            dup z26.b, #0
+            dup z27.b, #0
 
 loop_3_start%=:
             // for index_3 in shape_3 downto 1
@@ -156,8 +160,8 @@ loop_1_start%=:
 find_max_body_start%=:
             cmp x1, x13
             b.eq find_max_body_end%=
-            .inst 0xa0018374 // ld1b    { z20.b - z23.b }, pn8/z, [x27, x1]  z20-z23: x
-            .inst 0xc134b811 // umax    { z16.b - z19.b }, { z16.b - z19.b }, { z20.b - z23.b } z16-z19: max_value = max(max_value, x)
+            .inst 0xa001836c // ld1b    { z12.b - z15.b }, pn8/z, [x27, x1]  z12-z15: x
+            .inst 0xc12cb811 // umax    { z16.b - z19.b }, { z16.b - z19.b }, { z12.b - z15.b } z16-z19: max_value = max(max_value, x)
             add x1, x1, #256 // Advance index by 256 bytes/integers: Z registers = 2048-bit data = 256 8-bit integers.
             b find_max_body_start%=
 find_max_body_end%=:
@@ -181,12 +185,17 @@ find_max_leftover_end%=:
             dup z16.b, z16.b[0]
             uunpklo z16.h, z16.b // Using unpack instructions to align the max value with the FP32 entries in the LUT for use in the TBX instruction
             uunpklo z16.s, z16.h
+            mov z12.d, z16.d // Save to z12, as z16 will be overwritten.
 
             mov x1, #0 // reset index
-            dup z25.s, #0
+            dup z28.s, #0
 
             mov x1, #0
+            dup z13.s, #-16
 
+            // ==================================================
+            // Step 2: Exponentiation and Summation
+            // ==================================================
 regularize_start%=:
             whilelo p1.b, x1, %x[length]
             b.none regularize_end%=
@@ -201,192 +210,147 @@ regularize_start%=:
             punpkhi  p5.h, p4.b
             punpklo  p4.h, p4.b
 
-            ld1b z17.b, p1/z, [x27, x1] //z17: input data
-
-            uunpklo z18.h, z17.b //Using unpack instructions to align the input QASYMM8 values with the FP32 entries in the LUT for use in the TBX instruction
-            uunpkhi z19.h, z17.b
-
-            uunpklo z17.s, z18.h // z17 = low  low  input QASYMM8 values
-            uunpkhi z18.s, z18.h // z18 = low  high input QASYMM8 values
-
-            uunpkhi z20.s, z19.h // z20 = high high input QASYMM8 values
-            uunpklo z19.s, z19.h // z19 = high low  input QASYMM8 values
-
-            sub z17.s, z16.s, z17.s                                          // z12: x =  max_value - input_data
-            sub z18.s, z16.s, z18.s                                          // z13: x =  max_value - input_data
-            sub z19.s, z16.s, z19.s                                          // z14: x =  max_value - input_data
-            sub z20.s, z16.s, z20.s                                          // z15: x =  max_value - input_data
-
-            tbx z21.s, z0.s, z17.s  // Look-up entries 0-15 in the LUT.
-            tbx z22.s, z0.s, z18.s
-            tbx z23.s, z0.s, z19.s
-            tbx z24.s, z0.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z1.s, z17.s  // Look-up entries 16-31 in the LUT.
-            tbx z22.s, z1.s, z18.s
-            tbx z23.s, z1.s, z19.s
-            tbx z24.s, z1.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z2.s, z17.s  // Look-up entries 32-47 in the LUT.
-            tbx z22.s, z2.s, z18.s
-            tbx z23.s, z2.s, z19.s
-            tbx z24.s, z2.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z3.s, z17.s  // Look-up entries 48-63 in the LUT.
-            tbx z22.s, z3.s, z18.s
-            tbx z23.s, z3.s, z19.s
-            tbx z24.s, z3.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z4.s, z17.s  // Look-up entries 64-79 in the LUT.
-            tbx z22.s, z4.s, z18.s
-            tbx z23.s, z4.s, z19.s
-            tbx z24.s, z4.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z5.s, z17.s  // Look-up entries 80-95 in the LUT.
-            tbx z22.s, z5.s, z18.s
-            tbx z23.s, z5.s, z19.s
-            tbx z24.s, z5.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z6.s, z17.s  // Look-up entries 96-111 in the LUT.
-            tbx z22.s, z6.s, z18.s
-            tbx z23.s, z6.s, z19.s
-            tbx z24.s, z6.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z7.s, z17.s  // Look-up entries 112-127 in the LUT.
-            tbx z22.s, z7.s, z18.s
-            tbx z23.s, z7.s, z19.s
-            tbx z24.s, z7.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z8.s, z17.s  // Look-up entries 128-143 in the LUT.
-            tbx z22.s, z8.s, z18.s
-            tbx z23.s, z8.s, z19.s
-            tbx z24.s, z8.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z9.s, z17.s  // Look-up entries 144-159 in the LUT.
-            tbx z22.s, z9.s, z18.s
-            tbx z23.s, z9.s, z19.s
-            tbx z24.s, z9.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z10.s, z17.s  // Look-up entries 160-175 in the LUT.
-            tbx z22.s, z10.s, z18.s
-            tbx z23.s, z10.s, z19.s
-            tbx z24.s, z10.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z11.s, z17.s  // Look-up entries 176-191 in the LUT.
-            tbx z22.s, z11.s, z18.s
-            tbx z23.s, z11.s, z19.s
-            tbx z24.s, z11.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z12.s, z17.s  // Look-up entries 192-207 in the LUT.
-            tbx z22.s, z12.s, z18.s
-            tbx z23.s, z12.s, z19.s
-            tbx z24.s, z12.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z13.s, z17.s  // Look-up entries 208-223 in the LUT.
-            tbx z22.s, z13.s, z18.s
-            tbx z23.s, z13.s, z19.s
-            tbx z24.s, z13.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z14.s, z17.s  // Look-up entries 224-239 in the LUT.
-            tbx z22.s, z14.s, z18.s
-            tbx z23.s, z14.s, z19.s
-            tbx z24.s, z14.s, z20.s
-
-            sub z17.s, z17.s, #16
-            sub z18.s, z18.s, #16
-            sub z19.s, z19.s, #16
-            sub z20.s, z20.s, #16
-
-            tbx z21.s, z15.s, z17.s  // Look-up entries 240-255 in the LUT.
-            tbx z22.s, z15.s, z18.s
-            tbx z23.s, z15.s, z19.s
-            tbx z24.s, z15.s, z20.s
-
-
-            st1w z21.s, p2, [x29, x1, LSL #2]// z21 store exp(-scale*beta*x) into the tmp tensor
-            fadd z25.s, p2/m, z25.s, z21.s
+            ld1b z16.b, p1/z, [x27, x1] //z16: input data
+
+            uunpklo z17.h, z16.b //Using unpack instructions to align the input QASYMM8 values with the FP32 entries in the LUT for use in the TBX instruction
+            uunpkhi z18.h, z16.b
+
+            uunpklo z16.s, z17.h // z16 = low  low  input QASYMM8 values
+            uunpkhi z17.s, z17.h // z17 = low  high input QASYMM8 values
+
+            uunpkhi z19.s, z18.h // z19 = high high input QASYMM8 values
+            uunpklo z18.s, z18.h // z18 = high low  input QASYMM8 values
+
+            sub z16.s, z12.s, z16.s                                          // z16: x =  max_value - input_data
+            sub z17.s, z12.s, z17.s                                          // z17: x =  max_value - input_data
+            sub z18.s, z12.s, z18.s                                          // z18: x =  max_value - input_data
+            sub z19.s, z12.s, z19.s                                          // z19: x =  max_value - input_data
+
+            tbx z24.s, z0.s, z16.s  // Look-up entries 0-15 in the LUT.
+            tbx z25.s, z0.s, z17.s
+            tbx z26.s, z0.s, z18.s
+            tbx z27.s, z0.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z1.s, z16.s  // Look-up entries 16-31 in the LUT.
+            tbx z25.s, z1.s, z17.s
+            tbx z26.s, z1.s, z18.s
+            tbx z27.s, z1.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z2.s, z16.s  // Look-up entries 32-47 in the LUT.
+            tbx z25.s, z2.s, z17.s
+            tbx z26.s, z2.s, z18.s
+            tbx z27.s, z2.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z3.s, z16.s  // Look-up entries 48-63 in the LUT.
+            tbx z25.s, z3.s, z17.s
+            tbx z26.s, z3.s, z18.s
+            tbx z27.s, z3.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z4.s, z16.s  // Look-up entries 64-79 in the LUT.
+            tbx z25.s, z4.s, z17.s
+            tbx z26.s, z4.s, z18.s
+            tbx z27.s, z4.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z5.s, z16.s  // Look-up entries 80-95 in the LUT.
+            tbx z25.s, z5.s, z17.s
+            tbx z26.s, z5.s, z18.s
+            tbx z27.s, z5.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z6.s, z16.s  // Look-up entries 96-111 in the LUT.
+            tbx z25.s, z6.s, z17.s
+            tbx z26.s, z6.s, z18.s
+            tbx z27.s, z6.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z7.s, z16.s  // Look-up entries 112-127 in the LUT.
+            tbx z25.s, z7.s, z17.s
+            tbx z26.s, z7.s, z18.s
+            tbx z27.s, z7.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z8.s, z16.s  // Look-up entries 128-143 in the LUT.
+            tbx z25.s, z8.s, z17.s
+            tbx z26.s, z8.s, z18.s
+            tbx z27.s, z8.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z9.s, z16.s  // Look-up entries 144-159 in the LUT.
+            tbx z25.s, z9.s, z17.s
+            tbx z26.s, z9.s, z18.s
+            tbx z27.s, z9.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z10.s, z16.s  // Look-up entries 160-175 in the LUT.
+            tbx z25.s, z10.s, z17.s
+            tbx z26.s, z10.s, z18.s
+            tbx z27.s, z10.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z11.s, z16.s  // Look-up entries 176-191 in the LUT.
+            tbx z25.s, z11.s, z17.s
+            tbx z26.s, z11.s, z18.s
+            tbx z27.s, z11.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z20.s, z16.s  // Look-up entries 192-207 in the LUT.
+            tbx z25.s, z20.s, z17.s
+            tbx z26.s, z20.s, z18.s
+            tbx z27.s, z20.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z21.s, z16.s  // Look-up entries 208-223 in the LUT.
+            tbx z25.s, z21.s, z17.s
+            tbx z26.s, z21.s, z18.s
+            tbx z27.s, z21.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z22.s, z16.s  // Look-up entries 224-239 in the LUT.
+            tbx z25.s, z22.s, z17.s
+            tbx z26.s, z22.s, z18.s
+            tbx z27.s, z22.s, z19.s
+
+            .inst 0xc1adab10 //    add {z16.s-z19.s}, {z16.s-z19.s}, z13.s
+
+            tbx z24.s, z23.s, z16.s  // Look-up entries 240-255 in the LUT.
+            tbx z25.s, z23.s, z17.s
+            tbx z26.s, z23.s, z18.s
+            tbx z27.s, z23.s, z19.s
+
+
+            st1w z24.s, p2, [x29, x1, LSL #2]// z24 store exp(-scale*beta*x) into the tmp tensor
+            fadd z28.s, p2/m, z28.s, z24.s
             add x1, x1, #16
 
-            st1w z22.s, p3, [x29, x1, LSL #2]// z22 store exp(-scale*beta*x) into the tmp tensor
-            fadd z25.s, p3/m, z25.s, z22.s
+            st1w z25.s, p3, [x29, x1, LSL #2]// z25 store exp(-scale*beta*x) into the tmp tensor
+            fadd z28.s, p3/m, z28.s, z25.s
             add x1, x1, #16
 
-            st1w z23.s, p4, [x29, x1, LSL #2]// z23 store exp(-scale*beta*x) into the tmp tensor
-            fadd z25.s, p4/m, z25.s, z23.s
+            st1w z26.s, p4, [x29, x1, LSL #2]// z26 store exp(-scale*beta*x) into the tmp tensor
+            fadd z28.s, p4/m, z28.s, z26.s
             add x1, x1, #16
 
-            st1w z24.s, p5, [x29, x1, LSL #2]// z24 store exp(-scale*beta*x) into the tmp tensor
-            fadd z25.s, p5/m, z25.s, z24.s
+            st1w z27.s, p5, [x29, x1, LSL #2]// z27 store exp(-scale*beta*x) into the tmp tensor
+            fadd z28.s, p5/m, z28.s, z27.s
             add x1, x1, #16
 
             b regularize_start%=
@@ -395,9 +359,9 @@ regularize_end%=:
             mov w9, 0x0000
             movk w9, 0x4380, LSL #16 // Moving 256.f into w9 to scale - via multiplication (division by reciprocal) - the floating point [0,1] range of the results to the [0,255] integer range of QASYMM8
             dup z29.s, w9
-            faddv s25, p0, z25.s
-            fdiv s25, s29, s25
-            dup z25.s, z25.s[0] // z25: 256.f/sum. 256 is needed to get the full range and 1/sum is part of softmax.
+            faddv s28, p0, z28.s
+            fdiv s28, s29, s28
+            dup z28.s, z28.s[0] // z28: 256.f/sum. 256 is needed to get the full range and 1/sum is part of softmax.
 
             // ==================================================
             // Step 3: Normalize
@@ -408,36 +372,36 @@ normalize_body_start%=:
             b.eq normalize_body_end%=
 
             mov x2, x1       // Preserve the index into x2 for the final store to dst.
-            .inst 0xa001c7b0 // ld1w    { z16.s - z19.s }, pn9/z, [x29, x1, lsl #2]
+            .inst 0xa001c7ac // ld1w    { z12.s - z15.s }, pn9/z, [x29, x1, lsl #2]
             add x1, x1, #64
-            .inst 0xa001c7b4 // ld1w    { z20.s - z23.s }, pn9/z, [x29, x1, lsl #2]
+            .inst 0xa001c7b0 // ld1w    { z16.s - z19.s }, pn9/z, [x29, x1, lsl #2]
             add x1, x1, #64
 
-            // z16-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
-            fmul z16.s, z25.s, z16.s
-            fmul z17.s, z25.s, z17.s
-            fmul z18.s, z25.s, z18.s
-            fmul z19.s, z25.s, z19.s
-            fmul z20.s, z25.s, z20.s
-            fmul z21.s, z25.s, z21.s
-            fmul z22.s, z25.s, z22.s
-            fmul z23.s, z25.s, z23.s
-
-            // z16-z23: convert the FP32 values from the tmp tensor to uint32.
+            // z12-z19: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
+            fmul z12.s, z28.s, z12.s
+            fmul z13.s, z28.s, z13.s
+            fmul z14.s, z28.s, z14.s
+            fmul z15.s, z28.s, z15.s
+            fmul z16.s, z28.s, z16.s
+            fmul z17.s, z28.s, z17.s
+            fmul z18.s, z28.s, z18.s
+            fmul z19.s, z28.s, z19.s
+
+            // z12-z19: convert the FP32 values from the tmp tensor to uint32.
+            fcvtzu z12.s, p0/m, z12.s
+            fcvtzu z13.s, p0/m, z13.s
+            fcvtzu z14.s, p0/m, z14.s
+            fcvtzu z15.s, p0/m, z15.s
             fcvtzu z16.s, p0/m, z16.s
             fcvtzu z17.s, p0/m, z17.s
             fcvtzu z18.s, p0/m, z18.s
             fcvtzu z19.s, p0/m, z19.s
-            fcvtzu z20.s, p0/m, z20.s
-            fcvtzu z21.s, p0/m, z21.s
-            fcvtzu z22.s, p0/m, z22.s
-            fcvtzu z23.s, p0/m, z23.s
 
-            // z16-z17: narrow the uint32 values into uint8 and saturate them.
-            .inst 0xc133e230 // uqcvt    z16.b, { z16.s - z19.s }
-            .inst 0xc133e2b1 // uqcvt    z17.b, { z20.s - z23.s }
+            // z12-z13: narrow the uint32 values into uint8 and saturate them.
+            .inst 0xc133e1ac // uqcvt    z12.b, { z12.s - z15.s }
+            .inst 0xc133e22d // uqcvt    z13.b, { z16.s - z19.s }
 
-            dup z20.s, z25.s[0] // Juggling the value to z20 as z25 will be overwritten by the load below
+            dup z16.s, z28.s[0] // Juggling the value to z16 as z28 will be overwritten by the load below
 
             .inst 0xa001c7b8 // ld1w    { z24.s - z27.s }, pn9/z, [x29, x1, lsl #2]
             add x1, x1, #64
@@ -445,14 +409,14 @@ normalize_body_start%=:
             add x1, x1, #64
 
             // z24-z31: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
-            fmul z24.s, z20.s, z24.s
-            fmul z25.s, z20.s, z25.s
-            fmul z26.s, z20.s, z26.s
-            fmul z27.s, z20.s, z27.s
-            fmul z28.s, z20.s, z28.s
-            fmul z29.s, z20.s, z29.s
-            fmul z30.s, z20.s, z30.s
-            fmul z31.s, z20.s, z31.s
+            fmul z24.s, z16.s, z24.s
+            fmul z25.s, z16.s, z25.s
+            fmul z26.s, z16.s, z26.s
+            fmul z27.s, z16.s, z27.s
+            fmul z28.s, z16.s, z28.s
+            fmul z29.s, z16.s, z29.s
+            fmul z30.s, z16.s, z30.s
+            fmul z31.s, z16.s, z31.s
 
             // z24-z31: convert the FP32 values from the tmp tensor to uint32.
             fcvtzu z24.s, p0/m, z24.s
@@ -464,13 +428,13 @@ normalize_body_start%=:
             fcvtzu z30.s, p0/m, z30.s
             fcvtzu z31.s, p0/m, z31.s
 
-            // z18-z19: narrow the uint32 values into uint8 and saturate them.
-            .inst 0xc133e332 // uqcvt    z18.b, { z24.s - z27.s }
-            .inst 0xc133e3b3 // uqcvt    z19.b, { z28.s - z31.s }
+            // z14-z15: narrow the uint32 values into uint8 and saturate them.
+            .inst 0xc133e32e // uqcvt    z14.b, { z24.s - z27.s }
+            .inst 0xc133e3af // uqcvt    z15.b, { z28.s - z31.s }
 
-            .inst 0xa0228390 // st1b    { z16.b - z19.b }, pn8, [x28, x2]
+            .inst 0xa022838c // st1b    { z12.b - z15.b }, pn8, [x28, x2]
 
-            dup z25.s, z20.s[0] // Juggling the value back to z25 as z20 will be overwritten by the next iteration or z25 will be used below.
+            dup z28.s, z16.s[0] // Juggling the value back to z28 as z16 will be overwritten by the next iteration
 
 b normalize_body_start%=
 normalize_body_end%=:
@@ -491,32 +455,32 @@ normalize_leftover_start%=:
 
             mov x2, x1 // Preserve the index into x2 for the final store to dst.
 
-            // z20-z23: load exp(-scale*beta*x) from the tmp tensor
-            ld1w z20.s, p2/z, [x29, x1, LSL #2]
+            // z12-z15: load exp(-scale*beta*x) from the tmp tensor
+            ld1w z12.s, p2/z, [x29, x1, LSL #2]
             add x1, x1, #16
 
-            ld1w z21.s, p3/z, [x29, x1, LSL #2]
+            ld1w z13.s, p3/z, [x29, x1, LSL #2]
             add x1, x1, #16
 
-            ld1w z22.s, p4/z, [x29, x1, LSL #2]
+            ld1w z14.s, p4/z, [x29, x1, LSL #2]
             add x1, x1, #16
 
-            ld1w z23.s, p5/z, [x29, x1, LSL #2]
+            ld1w z15.s, p5/z, [x29, x1, LSL #2]
             add x1, x1, #16
 
-            // z20-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
-            fmul z20.s, z25.s, z20.s
-            fmul z21.s, z25.s, z21.s
-            fmul z22.s, z25.s, z22.s
-            fmul z23.s, z25.s, z23.s
+            // z12-z15: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256.
+            fmul z12.s, z28.s, z12.s
+            fmul z13.s, z28.s, z13.s
+            fmul z14.s, z28.s, z14.s
+            fmul z15.s, z28.s, z15.s
 
-            // z20-23: convert the FP32 values from the tmp tensor to uint32.
-            fcvtzu z20.s, p0/m, z20.s
-            fcvtzu z21.s, p0/m, z21.s
-            fcvtzu z22.s, p0/m, z22.s
-            fcvtzu z23.s, p0/m, z23.s
+            // z12-z15: convert the FP32 values from the tmp tensor to uint32.
+            fcvtzu z12.s, p0/m, z12.s
+            fcvtzu z13.s, p0/m, z13.s
+            fcvtzu z14.s, p0/m, z14.s
+            fcvtzu z15.s, p0/m, z15.s
 
-            .inst 0xc133e2b3 // uqcvt    z19.b, { z20.s - z23.s }, narrow the uint32 values into uint8 and saturate them into z19.
+            .inst 0xc133e1b3 // uqcvt    z19.b, { z12.s - z15.s }, narrow the uint32 values into uint8 and saturate them into z19.
 
             st1b z19.b, p1, [x28, x2]
 
@@ -550,7 +514,7 @@ loop_3_end%=:
           [dst_stride_3] "r"(dst_strides[3]),                            //
           [length] "r"(shape[0])                                         //
         : "cc", "memory",                                                //
-          "p0", "p1", "p2", "p3", "p4",                                  //
+          "p0", "p1", "p2", "p3", "p4", "p5",                            //
           "x2", "x9", "x13",                                             //
           "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", //
           "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",                //
diff --git a/src/cpu/kernels/softmax/generic/sve/impl_bf16.cpp b/src/cpu/kernels/softmax/generic/sve/impl_bf16.cpp
new file mode 100644
index 0000000000..e0b85d91f3
--- /dev/null
+++ b/src/cpu/kernels/softmax/generic/sve/impl_bf16.cpp
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+// SVE BF16 softmax along x for each row of the window: row max, LUT-based exponentials plus their sum, then scaling by 1/sum.
+void sve_softmax_bf16(const ITensor *in,
+                      void *const    tmp,
+                      ITensor       *out,
+                      const float    beta,
+                      int            axis,
+                      const Window  &window,
+                      const void    *lut_ptr)
+{
+    ARM_COMPUTE_UNUSED(tmp);  // no scratch buffer is used: out holds the intermediate exponentials
+    ARM_COMPUTE_UNUSED(beta); // presumably already folded into the LUT by the caller - TODO confirm
+    ARM_COMPUTE_UNUSED(axis); // the loops below always run along x
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lut_ptr);
+    const auto lut_fp16_ptr = reinterpret_cast<const uint16_t *>(lut_ptr); // LUT viewed as 16-bit entries
+
+    const int start_x     = in->info()->valid_region().anchor.x(); // x offset of the valid region
+    const int input_width = in->info()->valid_region().shape.x();  // extent of the valid region along x
+
+    Iterator in_it(in, window);
+    Iterator out_it(out, window);
+
+    const auto all_true_pg     = wrapper::svptrue<arm_compute::bfloat16>();
+    const auto all_true_pg_f32 = wrapper::svptrue<float32_t>();
+    const auto all_true_pg_u32 = wrapper::svptrue<uint32_t>();
+    const int  vec_count       = wrapper::svcnt<arm_compute::bfloat16>(); // loop stride in elements
+
+    execute_window_loop(
+        window,
+        [&](const Coordinates &)
+        {
+            /* Row base pointers: BF16 data is handled as raw 16-bit patterns, offset to the start of the valid region */
+            const auto in_ptr  = reinterpret_cast<const uint16_t *>(in_it.ptr()) + start_x;
+            const auto out_ptr = reinterpret_cast<uint16_t *>(out_it.ptr()) + start_x;
+
+            /* Step 1 - row maximum: the maximum is subtracted from every input before exponentiation (in numerator and denominator alike), which reduces the overall magnitude while maintaining correctness of the output */
+            float32_t max_val(std::numeric_limits<float32_t>::lowest());
+            {
+                auto vec_max = wrapper::svdup_n(support::cpp11::lowest<float32_t>());
+
+                int            x         = 0;
+                svbool_t       pg        = wrapper::svwhilelt<arm_compute::bfloat16>(x, input_width);
+                const svbool_t p_32_true = svptrue_b32();
+
+                svbool_t pg_u16      = wrapper::svwhilelt<uint16_t>(x, input_width);
+                svbool_t pg_f32_low  = svunpklo(pg_u16); // predicate for the FP32 lanes widened from the low half
+                svbool_t pg_f32_high = svunpkhi(pg_u16); // predicate for the FP32 lanes widened from the high half
+                do
+                {
+                    const svuint16_t current_value_bf16 = svld1(pg, in_ptr + x);
+                    // Widen BF16 -> FP32: unpack each 16-bit pattern into a 32-bit lane, then shift it into the upper 16 bits
+                    svuint32_t current_value_u32_low  = svunpklo(current_value_bf16);
+                    svuint32_t current_value_u32_high = svunpkhi(current_value_bf16);
+
+                    current_value_u32_low  = svlsl_n_u32_z(p_32_true, current_value_u32_low, 16);
+                    current_value_u32_high = svlsl_n_u32_z(p_32_true, current_value_u32_high, 16);
+
+                    const svfloat32_t current_value_fp32_low  = svreinterpret_f32_u32(current_value_u32_low);
+                    const svfloat32_t current_value_fp32_high = svreinterpret_f32_u32(current_value_u32_high);
+                    // Merging (_m) form: lanes past input_width keep their previous vec_max value
+                    vec_max = svmax_m(pg_f32_low, vec_max, current_value_fp32_low);
+                    vec_max = svmax_m(pg_f32_high, vec_max, current_value_fp32_high);
+
+                    x += vec_count;
+                    pg          = wrapper::svwhilelt<arm_compute::bfloat16>(x, input_width);
+                    pg_u16      = wrapper::svwhilelt<uint16_t>(x, input_width);
+                    pg_f32_low  = svunpklo(pg_u16);
+                    pg_f32_high = svunpkhi(pg_u16);
+                } while (svptest_any(all_true_pg, pg));
+
+                // Reduce the per-lane maxima to one scalar. NOTE(review): governed by the BF16 all-true predicate although vec_max holds FP32 lanes (the sum below uses all_true_pg_f32) - confirm this is intended
+                max_val = svmaxv(all_true_pg, vec_max);
+            }
+            float32_t sum(0.f);
+            {
+                /* Step 2 - look up the exponential of (x - max) in the LUT and accumulate the sum; the accumulator starts at zero */
+                svfloat32_t       vec_sum = wrapper::svdup_n(static_cast<float32_t>(0));
+                const svfloat32_t vec_max = wrapper::svdup_n(max_val);
+
+                /* Loop over the row: compute the exponentials, store them to out and add them to the running sum */
+                int x = 0;
+
+                svbool_t pg     = wrapper::svwhilelt<arm_compute::bfloat16>(x, input_width);
+                svbool_t pg_u16 = wrapper::svwhilelt<uint16_t>(x, input_width);
+
+                svbool_t pg_f32_low  = svunpklo(pg_u16);
+                svbool_t pg_f32_high = svunpkhi(pg_u16);
+
+                do
+                {
+                    const svuint16_t vec_elements = svld1(pg, in_ptr + x);
+                    // Same BF16 -> FP32 widening as in the max loop: unpack to 32 bits, then shift left by 16
+                    svuint32_t current_value_u32_low  = svunpklo(vec_elements);
+                    svuint32_t current_value_u32_high = svunpkhi(vec_elements);
+
+                    current_value_u32_low  = svlsl_n_u32_z(all_true_pg_u32, current_value_u32_low, 16);
+                    current_value_u32_high = svlsl_n_u32_z(all_true_pg_u32, current_value_u32_high, 16);
+
+                    const svfloat32_t current_value_fp32_low  = svreinterpret_f32_u32(current_value_u32_low);
+                    const svfloat32_t current_value_fp32_high = svreinterpret_f32_u32(current_value_u32_high);
+
+                    /* Subtract the row maximum computed in step 1 to reduce magnitude; this is effectively a division by the exponentiated maximum value of the current row */
+                    svfloat32_t vec_subbed_low_fp32  = svsub_z(pg_f32_low, current_value_fp32_low, vec_max);
+                    svfloat32_t vec_subbed_high_fp32 = svsub_z(pg_f32_high, current_value_fp32_high, vec_max);
+                    // Keep only the upper 16 bits of each FP32 difference (a BF16 bit pattern, no rounding); it is the LUT index
+                    const svuint16_t vec_subbed_low_uint16 = svreinterpret_u16_u32(
+                        svlsr_n_u32_z(all_true_pg_u32, svreinterpret_u32_f32(vec_subbed_low_fp32), 16));
+                    const svuint16_t vec_subbed_high_uint16 = svreinterpret_u16_u32(
+                        svlsr_n_u32_z(all_true_pg_u32, svreinterpret_u32_f32(vec_subbed_high_fp32), 16));
+
+                    // Gather one 16-bit LUT entry per active lane. Entries are consumed as BF16 bit patterns below despite the lut_fp16_ptr name; presumably they hold exp(beta * v) for index pattern v - TODO confirm
+                    const svuint32_t loaded_exp_16bit_values_low = svld1uh_gather_index_u32(
+                        pg_f32_low, lut_fp16_ptr, svreinterpret_u32_u16(vec_subbed_low_uint16));
+                    const svuint32_t loaded_exp_16bit_values_high = svld1uh_gather_index_u32(
+                        pg_f32_high, lut_fp16_ptr, svreinterpret_u32_u16(vec_subbed_high_uint16));
+
+                    // Recombine: pick the low 16 bits of every 32-bit lane of both halves into one vector of 16-bit values
+                    const svuint16_t exp_bf16 = svuzp1(svreinterpret_u16_u32(loaded_exp_16bit_values_low),
+                                                       svreinterpret_u16_u32(loaded_exp_16bit_values_high));
+
+                    /* This store is not the final output value: the output tensor temporarily holds the numerator/dividend of the softmax for use in the final step,
+                    as there are likely not enough registers for a whole axis' values */
+                    svst1(pg, out_ptr + x, exp_bf16);
+                    // Widen the looked-up values to FP32 so the sum is accumulated in FP32
+                    svuint32_t exp_u32_low  = svunpklo(exp_bf16);
+                    svuint32_t exp_u32_high = svunpkhi(exp_bf16);
+
+                    exp_u32_low  = svlsl_n_u32_z(all_true_pg_u32, exp_u32_low, 16);
+                    exp_u32_high = svlsl_n_u32_z(all_true_pg_u32, exp_u32_high, 16);
+
+                    const svfloat32_t exp_fp32_low  = svreinterpret_f32_u32(exp_u32_low);
+                    const svfloat32_t exp_fp32_high = svreinterpret_f32_u32(exp_u32_high);
+
+                    vec_sum = svadd_m(pg_f32_low, vec_sum, exp_fp32_low);
+                    vec_sum = svadd_m(pg_f32_high, vec_sum, exp_fp32_high);
+
+                    x += vec_count;
+                    pg     = wrapper::svwhilelt<arm_compute::bfloat16>(x, input_width);
+                    pg_u16 = wrapper::svwhilelt<uint16_t>(x, input_width);
+
+                    pg_f32_low  = svunpklo(pg_u16);
+                    pg_f32_high = svunpkhi(pg_u16);
+                } while (svptest_any(all_true_pg, pg));
+
+                /* Reduce the sum to a scalar and keep its reciprocal, so step 3 multiplies instead of dividing */
+                sum = svaddv(all_true_pg_f32, vec_sum);
+                sum = float32_t(1) / sum;
+            }
+
+            /* Step 3 - normalize the exponentials stored in out and overwrite them with the final result */
+            {
+                /* Loop over the row and compute the softmax values */
+                int      x           = 0;
+                svbool_t pg          = wrapper::svwhilelt<arm_compute::bfloat16>(x, input_width);
+                svbool_t pg_u16      = wrapper::svwhilelt<uint16_t>(x, input_width);
+                svbool_t pg_f32_low  = svunpklo(pg_u16);
+                svbool_t pg_f32_high = svunpkhi(pg_u16);
+
+                do
+                {
+                    const svuint16_t vec_in = svld1(pg, out_ptr + x); // exponentials written by step 2
+
+                    svuint32_t current_value_u32_low  = svunpklo(vec_in);
+                    svuint32_t current_value_u32_high = svunpkhi(vec_in);
+
+                    current_value_u32_low  = svlsl_n_u32_z(all_true_pg_u32, current_value_u32_low, 16);
+                    current_value_u32_high = svlsl_n_u32_z(all_true_pg_u32, current_value_u32_high, 16);
+
+                    const svfloat32_t current_value_fp32_low  = svreinterpret_f32_u32(current_value_u32_low);
+                    const svfloat32_t current_value_fp32_high = svreinterpret_f32_u32(current_value_u32_high);
+                    // sum already holds the reciprocal of the row sum here, hence the multiplication
+                    const svfloat32_t normalized_value_fp32_low =
+                        svmul_z(pg_f32_low, current_value_fp32_low, wrapper::svdup_n(sum));
+                    const svfloat32_t normalized_value_fp32_high =
+                        svmul_z(pg_f32_high, current_value_fp32_high, wrapper::svdup_n(sum));
+                    // Convert back to BF16 by keeping the upper 16 bits of each FP32 result (truncation, no rounding)
+                    const svuint16_t normalized_value_low_uint16 = svreinterpret_u16_u32(
+                        svlsr_n_u32_z(all_true_pg_u32, svreinterpret_u32_f32(normalized_value_fp32_low), 16));
+                    const svuint16_t normalized_value_high_uint16 = svreinterpret_u16_u32(
+                        svlsr_n_u32_z(all_true_pg_u32, svreinterpret_u32_f32(normalized_value_fp32_high), 16));
+
+                    const svuint16_t normalized_value_bf16 =
+                        svuzp1(normalized_value_low_uint16, normalized_value_high_uint16);
+
+                    svst1(pg, out_ptr + x, normalized_value_bf16);
+
+                    x += vec_count;
+                    pg          = wrapper::svwhilelt<arm_compute::bfloat16>(x, input_width);
+                    pg_u16      = wrapper::svwhilelt<uint16_t>(x, input_width);
+                    pg_f32_low  = svunpklo(pg_u16);
+                    pg_f32_high = svunpkhi(pg_u16);
+                } while (svptest_any(all_true_pg, pg));
+            }
+        },
+        in_it, out_it);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h
index 9b11f1eaed..1e39581ef2 100644
--- a/src/cpu/kernels/softmax/list.h
+++ b/src/cpu/kernels/softmax/list.h
@@ -74,6 +74,18 @@ void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in,
 
 #endif // ARM_COMPUTE_ENABLE_SME2
 
+#ifdef ARM_COMPUTE_ENABLE_BF16
+
+void sve_softmax_bf16(const ITensor *in,
+                      void *const    tmp,
+                      ITensor       *out,
+                      const float    beta,
+                      int            axis,
+                      const Window  &window,
+                      const void    *lut_ptr);
+
+#endif // ARM_COMPUTE_ENABLE_BF16
+
 #undef DECLARE_SOFTMAX_KERNEL
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h
index 73c85f2dcd..2563270133 100644
--- a/src/cpu/operators/CpuDirectConv2d.h
+++ b/src/cpu/operators/CpuDirectConv2d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,14 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
-#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV2D_H
+#define ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV2D_H
 
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -56,7 +57,10 @@ namespace cpu
 class CpuDirectConv2d : public ICpuOperator
 {
 public:
-    CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager);
+    CpuDirectConv2d() : CpuDirectConv2d(MemoryManagerOnDemand::make_default())
+    {
+    }
     ~CpuDirectConv2d();
     /** Set the input, weights, biases and output tensors.
      *
@@ -112,4 +116,4 @@ class CpuDirectConv2d : public ICpuOperator
 };
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV2D_H
diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h
index 3ad1e09a14..3c2a435042 100644
--- a/src/cpu/operators/CpuDirectConv3d.h
+++ b/src/cpu/operators/CpuDirectConv3d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H
-#define ARM_COMPUTE_CPU_DIRECTCONV3D_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV3D_H
+#define ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV3D_H
 
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/ITensorInfo.h"
@@ -30,6 +30,7 @@
 #include "arm_compute/runtime/FunctionDescriptors.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/MemoryManagerOnDemand.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -54,7 +55,10 @@ namespace cpu
 class CpuDirectConv3d : public ICpuOperator
 {
 public:
-    CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager);
+    CpuDirectConv3d() : CpuDirectConv3d(MemoryManagerOnDemand::make_default())
+    {
+    }
     ~CpuDirectConv3d();
     /** Set the input, weights, biases and output tensor info.
      *
@@ -104,4 +108,4 @@ class CpuDirectConv3d : public ICpuOperator
 };
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DIRECTCONV3D_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUDIRECTCONV3D_H
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 0ea3c249df..f7af2d7fc2 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -361,6 +361,8 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
         "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.pretranspose_A(), "Matrix A already pretransposed is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.pretranspose_B(), "Matrix B already pretransposed is not supported");
 
     // When using accumulation(in place summation), for now, the only supported DataType for output is S32.
     if (gemm_info.accumulate())
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
index acc620edc6..38f53cf53c 100644
--- a/src/cpu/operators/CpuMatMul.cpp
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -164,9 +164,13 @@ Status CpuMatMul::validate(const ITensorInfo         *lhs,
         arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY;
         ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, lhs_to_use,
                                                                                rhs_to_use, nullptr, dst, gemm_info));
+
+        // Set gemm weights info to the one returned by has_opt_impl, as the user queries the kernel for the format to set.
+        gemm_info.weight_format = expected_weight_format;
     }
 
-    cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info));
 
     return Status{};
 }
@@ -251,9 +255,11 @@ void CpuMatMul::configure(ITensorInfo               *lhs,
     {
         _gemm_info.weight_format                         = WeightFormat::ANY;
         arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY;
-        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, &lhs_to_use,
-                                                                              &rhs_to_use, nullptr, dst, _gemm_info));
-        // Set gemm weights info to the one returned by has_opt_impl
+        Status ret = cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, &lhs_to_use, &rhs_to_use,
+                                                                nullptr, dst, _gemm_info);
+        ARM_COMPUTE_ERROR_THROW_ON(ret);
+
+        // Set gemm weights info to the one returned by has_opt_impl, as the user queries the kernel for the format to set.
         _gemm_info.weight_format = expected_weight_format;
         // has_opt_impl may return a non fast math kernel, even if we requested one
         _gemm_info.fast_mode = arm_compute::is_fixed_format_fast_math(expected_weight_format);
@@ -264,6 +270,7 @@ void CpuMatMul::configure(ITensorInfo               *lhs,
     _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use,
                          _gemm_info); // c is nullptr as bias not supported in MatMul
 
+    ARM_COMPUTE_EXIT_ON_MSG(!_asm_glue->is_configured(), "Error in CpuGemmAssemblyDispatch configuration");
     // Specify memory requirements for intermediate tensors
     auto asm_mem_req = _asm_glue->workspace();
     // Specify memory required by gemm kernel
diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp
index bbffcf55a3..2d8cfceb91 100644
--- a/src/gpu/cl/kernels/ClCastKernel.cpp
+++ b/src/gpu/cl/kernels/ClCastKernel.cpp
@@ -54,12 +54,18 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver
     ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
         src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED,
-        DataType::QSYMM8_PER_CHANNEL, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16,
-        DataType::F32, DataType::S64, DataType::U64);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
-                                                         DataType::S16, DataType::U16, DataType::U32, DataType::S32,
-                                                         DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different");
+        DataType::QSYMM8_PER_CHANNEL, DataType::QASYMM16, DataType::QSYMM16, DataType::S16, DataType::U16,
+        DataType::U32, DataType::S32, DataType::F16, DataType::F32, DataType::S64, DataType::U64);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+        dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED,
+        DataType::QSYMM8_PER_CHANNEL, DataType::QASYMM16, DataType::QSYMM16, DataType::S16, DataType::U16,
+        DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+    const DataType src_dtype = get_underlying_data_type(src->data_type());
+    const DataType dst_dtype = get_underlying_data_type(dst->data_type());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src_dtype == dst_dtype, "src and dst data types must be different");
 
     // Validate in case of configured dst
     if (dst->total_size() > 0)
@@ -83,6 +89,9 @@ void ClCastKernel::configure(const CLCompileContext &compile_context,
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
+    const DataType src_dtype = src->data_type();
+    const DataType dst_dtype = dst->data_type();
+
     // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
     set_shape_if_empty(*dst, src->tensor_shape());
 
@@ -91,24 +100,24 @@ void ClCastKernel::configure(const CLCompileContext &compile_context,
     auto padding_info = get_padding_info({src, dst});
 
     // Get data sizes
-    const size_t src_size = data_size_from_type(src->data_type());
-    const size_t dst_size = data_size_from_type(dst->data_type());
+    const size_t src_size = data_size_from_type(src_dtype);
+    const size_t dst_size = data_size_from_type(dst_dtype);
 
     // Get number of elements to process per iterations
     const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
 
     // Set build options
+
     CLBuildOptions build_opts;
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
                           support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
-    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src_dtype));
+    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst_dtype));
     // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
-    build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
-    build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()),
-                             "-DIS_DATA_TYPE_FLOAT");
-    build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED");
+    build_opts.add_option_if(is_data_type_float(src_dtype) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
+    build_opts.add_option_if(dst_dtype == DataType::QASYMM8 && is_data_type_quantized_per_channel(src_dtype),
+                             "-DQSYMM8_PER_CHANNEL_TO_QASYMM8");
 
     // Create kernel
     const std::string kernel_name = (src_size >= dst_size) ? "cast_down" : "cast_up";
@@ -128,7 +137,7 @@ void ClCastKernel::configure(const CLCompileContext &compile_context,
     // Set config_id for enabling LWS tuning
     _config_id = kernel_name;
     _config_id += "_";
-    _config_id += lower_string(string_from_data_type(src->data_type()));
+    _config_id += lower_string(string_from_data_type(src_dtype));
     _config_id += "_";
     _config_id += support::cpp11::to_string(src->dimension(0));
     _config_id += "_";
diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp
index e8df420f67..a01e31559f 100644
--- a/src/gpu/cl/kernels/ClQuantizeKernel.cpp
+++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, 2023 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/QuantizationInfo.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
@@ -80,56 +81,28 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const
     const int  input_width_x  = src->tensor_shape().x();
     const bool multi_access_x = (input_width_x / vec_size_x > 0);
 
-    const UniformQuantizationInfo qinfo            = dst->quantization_info().uniform();
+    const UniformQuantizationInfo dst_qinfo        = dst->quantization_info().uniform();
     const DataType                output_data_type = dst->data_type();
 
-    float   scale_to_apply  = qinfo.scale;
-    int32_t offset_to_apply = qinfo.offset;
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT");
+
     if (is_data_type_quantized_asymmetric(src->data_type()))
     {
-        /*
-         * In case of requantization of a quantized input tensor to an output tensor with another quantization
-         * instead of of apply dequantization and then a quantization functions, we just compute new scale and
-         * offset to apply.
-         *
-         * Assuming:
-         *   - q_i as input quantized value
-         *   - q_o as output quantized value
-         *   - z_i as input quantization offset value
-         *   - z_o as output quantization offset value
-         *   - s_i as input quantization scale value
-         *   - s_o as output quantization scale value
-         *   - z_n as new quantization offset value
-         *   - s_n as new quantization scale value
-         *
-         * q_o = ( q_i - z_i ) * s_i / s_o + z_o
-         *
-         * We can rewrite the formula as:
-         *
-         * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o
-         *
-         * q_o = q_i / s_n + z_n
-         *
-         * Where:
-         *
-         * s_n = s_o / s_i
-         *
-         * z_n = - z_i * s_i / s_o + z_o
-         *
-         */
-        const UniformQuantizationInfo qinfo_in = src->quantization_info().uniform();
-        scale_to_apply /= qinfo_in.scale;
-        // In order to minimize flooring we convert the offset to a float,
-        // then compute the new offset in the float domain,
-        // finally we convert it back as int32_t
-        offset_to_apply -= static_cast<int32_t>(static_cast<float>(qinfo_in.offset) * qinfo_in.scale / qinfo.scale);
+        const UniformQuantizationInfo src_qinfo = src->quantization_info().uniform();
+
+        const UniformRequantizationInfo reqinfo = compute_requantization_scale_float_offset(src_qinfo, dst_qinfo);
+
+        build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(reqinfo.scale));
+        build_opts.add_option("-DOFFSET=" + float_to_string_with_full_precision(reqinfo.offset));
+    }
+    else
+    {
+        build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(dst_qinfo.scale));
+        build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(dst_qinfo.offset));
     }
 
-    // Create kernel
-    CLBuildOptions build_opts;
-    build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT");
-    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply));
-    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
     build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp
index 8f26ef003d..7636736983 100644
--- a/src/gpu/cl/operators/ClCast.cpp
+++ b/src/gpu/cl/operators/ClCast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,8 @@ void ClCast::configure(const ClCompileContext &compile_context,
                        ConvertPolicy           policy)
 {
     ARM_COMPUTE_LOG_PARAMS(src, dst, policy);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, policy));
+
     auto k = std::make_unique<kernels::ClCastKernel>();
     k->configure(compile_context, src, dst, policy);
     _kernel = std::move(k);
@@ -44,6 +46,12 @@ void ClCast::configure(const ClCompileContext &compile_context,
 
 Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
 {
+    // Guard the dereferences below: reject null tensor infos with an error Status.
+    ARM_COMPUTE_RETURN_ERROR_ON(src == nullptr || dst == nullptr);
+    // QSYMM8_PER_CHANNEL -> QASYMM8 is supported by ClCastKernel but has an unusual casting
+    // behavior (unlike Int8 <-> UInt8), so this mode is not exposed in the public API.
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::QSYMM8_PER_CHANNEL &&
+                                dst->data_type() == DataType::QASYMM8);
     return kernels::ClCastKernel::validate(src, dst, policy);
 }
 } // namespace opencl
diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h
index 25d2293673..9469a8018e 100644
--- a/src/gpu/cl/operators/ClCast.h
+++ b/src/gpu/cl/operators/ClCast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CL_CAST_H
-#define ARM_COMPUTE_CL_CAST_H
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLCAST_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLCAST_H
 
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
@@ -42,20 +42,11 @@ class ClCast : public IClOperator
      * Valid data layouts:
      * - All
      *
-     * Valid data type configurations:
-     * |src            |dst                                    |
-     * |:--------------|:--------------------------------------|
-     * |U8             | S8, U16, S16, U32, S32, F16, F32      |
-     * |U16            | U8, S8, S16, U32, S32, F16, F32       |
-     * |S16            | U8, S8, U16, U32, S32, F16, F32       |
-     * |U32            | U8, S8, U16, S16, S32, F16, F32       |
-     * |S32            | U8, S8, U16, S16, U32, F16, F32       |
-     * |F16            | U8, S8, U16, S16, U32, F32            |
-     * |F32            | U8, S8, U16, S16, U32, F16            |
+     * For the supported data type configurations, please refer to @ref CLCast
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  src             The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[out] dst             The destinatio tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  src             The source tensor to convert.
+     * @param[out] dst             The destination tensor.
      * @param[in]  policy          Conversion policy.
      */
     void
@@ -70,4 +61,4 @@ class ClCast : public IClOperator
 };
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_CAST_H */
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLCAST_H
diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp
index 5fa9ea47e9..f6924d703e 100644
--- a/src/runtime/MemoryManagerOnDemand.cpp
+++ b/src/runtime/MemoryManagerOnDemand.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 Arm Limited.
+ * Copyright (c) 2016-2018, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,8 +24,10 @@
 #include "arm_compute/runtime/MemoryManagerOnDemand.h"
 
 #include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/BlobLifetimeManager.h"
 #include "arm_compute/runtime/ILifetimeManager.h"
 #include "arm_compute/runtime/IPoolManager.h"
+#include "arm_compute/runtime/PoolManager.h"
 
 #include <memory>
 
@@ -71,4 +73,13 @@ void MemoryManagerOnDemand::clear()
     ARM_COMPUTE_ERROR_ON_MSG(!_pool_mgr, "Pool manager not specified correctly!");
     _pool_mgr->clear_pools();
 }
+
+std::shared_ptr<MemoryManagerOnDemand> MemoryManagerOnDemand::make_default()
+{
+    // Build the default manager from a BlobLifetimeManager and a PoolManager.
+    const auto blob_lifetime_manager = std::make_shared<BlobLifetimeManager>();
+    const auto default_pool_manager  = std::make_shared<PoolManager>();
+
+    return std::make_shared<MemoryManagerOnDemand>(blob_lifetime_manager, default_pool_manager);
+}
 } //namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index be451bcdeb..8a98437caf 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -136,6 +136,7 @@ void NEFullyConnectedLayer::prepare()
     if (!_impl->is_prepared)
     {
         allocate_tensors(_impl->aux_mem_req, _impl->workspace);
+        MemoryGroupResourceScope scope_mg(_impl->memory_group);
         _impl->op->prepare(_impl->run_pack);
 
         // Release temporary tensors that are only used in prepare stage
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index d26b819864..b64bbbe3ad 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -76,8 +76,9 @@ void NEGEMM::configure(const ITensor  *a,
 
     // Check if we need to reshape the matrix B only on the first run
     _impl->is_prepared = false;
-    _impl->original_b  = b;
-    _impl->op          = std::make_unique<cpu::CpuGemm>();
+    _impl->memory_group.mappings().clear();
+    _impl->original_b = b;
+    _impl->op         = std::make_unique<cpu::CpuGemm>();
 
     // Make the B matrix dynamic values.
     auto b_info_to_use = b->info()->clone();
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index b5cdd864ba..a104ce02d1 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -60,7 +60,8 @@ void NEGEMMConv2d::configure(
 
     _impl->weights     = weights;
     _impl->is_prepared = false;
-    _impl->op          = std::make_unique<OperatorType>();
+    _impl->memory_group.mappings().clear();
+    _impl->op = std::make_unique<OperatorType>();
 
     _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
                          info);
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 03df5115f0..ffc73f0bc0 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -70,8 +70,9 @@ void NEGEMMConvolutionLayer::configure(const ITensor             *input,
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
     _impl->is_prepared = false;
-    _impl->weights     = weights;
-    _impl->op          = std::make_unique<cpu::CpuGemmConv2d>();
+    _impl->memory_group.mappings().clear();
+    _impl->weights = weights;
+    _impl->op      = std::make_unique<cpu::CpuGemmConv2d>();
     _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(),
                          conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
 
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 6d172cef27..1c730bd031 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -24,17 +24,20 @@
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/core/utils/quantization/AsymmHelpers.h"
 #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
 
+#include <set>
+
 using namespace arm_compute::experimental;
 
 namespace arm_compute
@@ -49,6 +52,7 @@ struct NEGEMMLowpMatrixMultiplyCore::Impl
     IWeightsManager                                    *weights_manager{nullptr};
     MemoryRequirements                                  aux_mem_req{};
     WorkspaceData<Tensor>                               workspace_tensors{};
+    ActivationLayerInfo                                 act_info{};
     bool                                                is_prepared{false};
 };
 
@@ -74,8 +78,9 @@ void NEGEMMLowpMatrixMultiplyCore::configure(
     }
 
     _impl->is_prepared = false;
-    _impl->b           = b;
-    _impl->op          = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
+    _impl->memory_group.mappings().clear();
+    _impl->b  = b;
+    _impl->op = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
     _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(),
                          gemm_info);
     _impl->run_pack          = {{TensorType::ACL_SRC_0, a},
@@ -84,6 +89,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(
                                 {TensorType::ACL_DST, output}};
     _impl->prep_pack         = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}};
     _impl->aux_mem_req       = _impl->op->workspace();
+    _impl->act_info          = gemm_info.activation_info();
     _impl->workspace_tensors = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack,
                                                         _impl->prep_pack, /* allocate_now */ false);
 }
@@ -106,6 +112,11 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
 
 void NEGEMMLowpMatrixMultiplyCore::update_quantization_parameters()
 {
+    // Supported activations in GEMM
+    const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+        ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+        ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+
     auto src = _impl->run_pack.get_const_tensor(ACL_SRC_0);
     auto wei = _impl->run_pack.get_const_tensor(ACL_SRC_1);
     auto dst = _impl->run_pack.get_tensor(ACL_DST);
@@ -114,14 +125,23 @@ void NEGEMMLowpMatrixMultiplyCore::update_quantization_parameters()
     const QuantizationInfo wqinfo = wei->info()->quantization_info();
     const QuantizationInfo oqinfo = (dst->info()->total_size() == 0) ? iqinfo : dst->info()->quantization_info();
 
-    int32_t min_activation = 0;
-    int32_t max_activation = 0;
-    std::tie(min_activation, max_activation) =
-        quantization::get_quantized_asymmetric_output_min_max(wqinfo, ActivationLayerInfo(), wei->info()->data_type());
+    PixelValue     type_min{};
+    PixelValue     type_max{};
+    const DataType data_type     = src->info()->data_type();
+    std::tie(type_min, type_max) = get_min_max(data_type);
+    int32_t min_activation       = type_min.get<int32_t>();
+    int32_t max_activation       = type_max.get<int32_t>();
+
+    const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+    if (supported_acts.find(_impl->act_info.activation()) != supported_acts.end())
+    {
+        std::tie(min_activation, max_activation) =
+            get_quantized_activation_min_max(_impl->act_info, data_type, uoqinfo);
+    }
 
     GEMMLowpOutputStageInfo output_info;
     output_info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-    output_info.gemmlowp_offset          = oqinfo.uniform().offset;
+    output_info.gemmlowp_offset          = uoqinfo.offset;
     output_info.gemmlowp_min_bound       = min_activation;
     output_info.gemmlowp_max_bound       = max_activation;
     output_info.is_quantized_per_channel = false;
diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp
index 31898bafc4..36ce7de262 100644
--- a/src/runtime/NEON/functions/NEMatMul.cpp
+++ b/src/runtime/NEON/functions/NEMatMul.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,16 +34,23 @@ namespace arm_compute
 {
 struct NEMatMul::Impl
 {
+    Impl(std::shared_ptr<IMemoryManager> memory_manager) : memory_group(memory_manager)
+    {
+    }
+
+    Impl(const Impl &)            = delete;
+    Impl &operator=(const Impl &) = delete;
+
     const ITensor                  *lhs{nullptr};
     const ITensor                  *rhs{nullptr};
     ITensor                        *output{nullptr};
     std::unique_ptr<cpu::CpuMatMul> op{nullptr};
-    MemoryGroup                     memory_group{};
+    MemoryGroup                     memory_group;
     WorkspaceData<Tensor>           workspace_tensors{};
     ITensorPack                     run_pack{};
 };
 
-NEMatMul::NEMatMul() : _impl(std::make_unique<Impl>())
+NEMatMul::NEMatMul(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>(memory_manager))
 {
 }
 
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index b72aff577a..e3bfc1b2c2 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -69,7 +69,8 @@ void NEWinogradConvolutionLayer::configure(const ITensor             *input,
                                            const ActivationLayerInfo &act_info,
                                            bool                       enable_fast_math)
 {
-    _impl->is_prepared      = false;
+    _impl->is_prepared = false;
+    _impl->memory_group.mappings().clear();
     _impl->original_weights = weights;
     _impl->op               = std::make_unique<cpu::CpuWinogradConv2d>();
     _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
diff --git a/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp b/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp
index ec0557ff4e..adda460c96 100644
--- a/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp
@@ -79,11 +79,45 @@ void CpuGemmAssemblyDispatch::configure(
 Status CpuGemmAssemblyDispatch::validate(
     const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info)
 {
-    if (gemm_info.reinterpret_input_as_3d() != false || gemm_info.depth_output_gemm3d() != false ||
-        gemm_info.reshape_b_only_on_first_run() != true)
+    // Reject gemm_info configurations this wrapper does not support before inspecting the tensors.
+    if (gemm_info.reinterpret_input_as_3d() || gemm_info.depth_output_gemm3d() ||
+        !gemm_info.reshape_b_only_on_first_run())
     {
-        return Status(ErrorCode::RUNTIME_ERROR);
+        return Status(ErrorCode::RUNTIME_ERROR, "unsupported arguments in gemm_info");
     }
+
+    // a, b and d are dereferenced below, so report a null pointer as an error Status instead of
+    // crashing. c is allowed to be null (c_data_type_ok below requires it).
+    if (a == nullptr || b == nullptr || d == nullptr)
+    {
+        return Status(ErrorCode::RUNTIME_ERROR, "a, b and d must not be nullptr");
+    }
+
+    // Floating-point path: a, b and d must each be F32 or F16, and c must be null.
+    bool a_data_type_ok = a->data_type() == DataType::F32 || a->data_type() == DataType::F16;
+    bool b_data_type_ok = b->data_type() == DataType::F32 || b->data_type() == DataType::F16;
+    bool c_data_type_ok = c == nullptr;
+    bool d_data_type_ok = d->data_type() == DataType::F32 || d->data_type() == DataType::F16;
+
+    // BFLOAT16 path. NOTE(review): the "(a is BFLOAT16 && b is BFLOAT16)" term is subsumed by
+    // "b is BFLOAT16", so a's data type is effectively unconstrained here - confirm this is intended.
+    bool bf16_ok        = ((a->data_type() == DataType::BFLOAT16 && b->data_type() == DataType::BFLOAT16) ||
+                    b->data_type() == DataType::BFLOAT16) &&
+                   (d->data_type() == DataType::BFLOAT16 || d->data_type() == DataType::F32);
+
+    // When gemm_info.fixed_format() is set, a, b and d must all be F32 or all be F16, unless the
+    // BFLOAT16 path applies.
+    bool fixed_format_dtype_ok =
+        (!gemm_info.fixed_format() ||
+         (a->data_type() == DataType::F32 && b->data_type() == DataType::F32 && d->data_type() == DataType::F32) ||
+         (a->data_type() == DataType::F16 && b->data_type() == DataType::F16 && d->data_type() == DataType::F16) ||
+         bf16_ok);
+
+    if (!((a_data_type_ok && b_data_type_ok && c_data_type_ok && d_data_type_ok && fixed_format_dtype_ok) || bf16_ok))
+    {
+        return Status(ErrorCode::RUNTIME_ERROR, "datatype is not supported");
+    }
+
     return cpu::CpuGemmAssemblyDispatch::validate(a, b, c, d, init_assembly_metadata(gemm_info));
 }
 
diff --git a/src/runtime/experimental/operators/CpuDequantize.cpp b/src/runtime/experimental/operators/CpuDequantize.cpp
new file mode 100644
index 0000000000..08592d54d3
--- /dev/null
+++ b/src/runtime/experimental/operators/CpuDequantize.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/operators/CpuDequantize.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/experimental/operators/CpuDequantize.h"
+
+#include <memory>
+#include <utility>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace op
+{
+// Private state of the wrapper: the cpu operator that configure() builds and run() executes.
+struct CpuDequantize::Impl
+{
+    std::unique_ptr<cpu::CpuDequantize> op{nullptr};
+};
+
+CpuDequantize::CpuDequantize() : impl_(std::make_unique<Impl>())
+{
+}
+CpuDequantize::~CpuDequantize() = default;
+
+void CpuDequantize::configure(const ITensorInfo *input, ITensorInfo *output)
+{
+    // Configure a fresh operator and only store it once configure() has returned. If the wrapped
+    // configure() throws, a previously configured operator stays in place instead of being replaced
+    // by an unconfigured one.
+    auto configured_op = std::make_unique<cpu::CpuDequantize>();
+    configured_op->configure(input, output);
+    impl_->op = std::move(configured_op);
+}
+
+Status CpuDequantize::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    // Static check: forwards the tensor infos unchanged to the wrapped operator's validate().
+    return cpu::CpuDequantize::validate(input, output);
+}
+
+void CpuDequantize::run(ITensorPack &tensors)
+{
+    // impl_->op is null until configure() has succeeded, so configure() must be called first.
+    impl_->op->run(tensors);
+}
+
+} // namespace op
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/runtime/experimental/operators/CpuGEMMLowp.cpp b/src/runtime/experimental/operators/CpuGEMMLowp.cpp
new file mode 100644
index 0000000000..57391fbca9
--- /dev/null
+++ b/src/runtime/experimental/operators/CpuGEMMLowp.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/experimental/operators/CpuGEMMLowp.h"
+
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/utils/quantization/AsymmHelpers.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+
+#include <algorithm>
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace op
+{
+struct CpuGEMMLowp::Impl
+{
+    // Wrapped operator; created once in the constructor and reused by every configure() call.
+    std::unique_ptr<arm_compute::cpu::CpuGemmLowpMatrixMultiplyCore> op{nullptr};
+    // Set by prepare() and cleared by configure(), so preparation runs once per configuration.
+    bool                                                             is_prepared{false};
+};
+
+CpuGEMMLowp::CpuGEMMLowp() : _impl(std::make_unique<Impl>())
+{
+    _impl->op = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
+}
+CpuGEMMLowp::~CpuGEMMLowp() = default;
+
+experimental::MemoryRequirements CpuGEMMLowp::workspace() const
+{
+    return _impl->op->workspace();
+}
+
+void CpuGEMMLowp::configure(
+    const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+    // Work on a clone of b's info (b itself is const). The clone is marked as holding non-constant
+    // values when gemm_info does not request reshaping B only on the first run.
+    auto b_info_to_use = b->clone();
+    if (!gemm_info.reshape_b_only_on_first_run())
+    {
+        b_info_to_use->set_are_values_constant(false);
+    }
+
+    _impl->is_prepared = false;
+    // c is forwarded as-is; a null c is passed through as nullptr.
+    _impl->op->configure(a, b_info_to_use.get(), c, output, gemm_info);
+}
+
+Status CpuGEMMLowp::validate(const ITensorInfo *a,
+                             const ITensorInfo *b,
+                             const ITensorInfo *c,
+                             const ITensorInfo *output,
+                             const GEMMInfo    &gemm_info)
+{
+    // b is dereferenced below; fail with a Status rather than crash, mirroring the check in configure().
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, output);
+
+    // Apply the same adjustment to b's info as configure() does, so both see identical inputs.
+    auto b_info_to_use = b->clone();
+    if (!gemm_info.reshape_b_only_on_first_run())
+    {
+        b_info_to_use->set_are_values_constant(false);
+    }
+
+    return cpu::CpuGemmLowpMatrixMultiplyCore::validate(a, b_info_to_use.get(), c, output, gemm_info);
+}
+
+void CpuGEMMLowp::run(ITensorPack &tensors)
+{
+    // prepare() returns immediately once it has run for the current configuration.
+    prepare(tensors);
+    _impl->op->run(tensors);
+}
+
+void CpuGEMMLowp::prepare(ITensorPack &tensors)
+{
+    if (_impl->is_prepared)
+    {
+        return;
+    }
+
+    _impl->op->prepare(tensors);
+
+    // A workspace entry with Persistent lifetime is treated as the sign that B is no longer needed
+    // after preparation. NOTE(review): presumably because B was reshaped into that memory - confirm.
+    const auto aux_mem_req = _impl->op->workspace();
+    const bool has_persistent_aux =
+        std::any_of(aux_mem_req.begin(), aux_mem_req.end(),
+                    [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+
+    if (has_persistent_aux)
+    {
+        // The pack may not hold a mutable tensor under ACL_SRC_1; skip the marking instead of
+        // dereferencing a null pointer.
+        auto *b = tensors.get_tensor(TensorType::ACL_SRC_1);
+        if (b != nullptr)
+        {
+            b->mark_as_unused();
+        }
+    }
+
+    _impl->is_prepared = true;
+}
+} // namespace op
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/runtime/experimental/operators/CpuGemmConv2d.cpp b/src/runtime/experimental/operators/CpuGemmConv2d.cpp
index 7253f6e0f1..3174dd30a8 100644
--- a/src/runtime/experimental/operators/CpuGemmConv2d.cpp
+++ b/src/runtime/experimental/operators/CpuGemmConv2d.cpp
@@ -90,6 +90,11 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo
                                             weights_info, dilation, act_info, enable_fast_math);
 }
 
+void CpuGemmConv2d::update_quantization_parameters(ITensorPack &tensors)
+{
+    _impl->op->update_quantization_parameters(tensors); // pure forwarder to the wrapped operator
+}
+
 void CpuGemmConv2d::run(ITensorPack &tensors)
 {
     prepare(tensors);
diff --git a/src/runtime/experimental/operators/CpuQuantize.cpp b/src/runtime/experimental/operators/CpuQuantize.cpp
new file mode 100644
index 0000000000..59a65d3611
--- /dev/null
+++ b/src/runtime/experimental/operators/CpuQuantize.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/operators/CpuQuantize.h"
+
+#include "arm_compute/runtime/experimental/operators/CpuQuantize.h"
+
+#include <memory>
+#include <utility>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace op
+{
+// Private state of the wrapper: the cpu operator that configure() builds and run() executes.
+struct CpuQuantize::Impl
+{
+    std::unique_ptr<cpu::CpuQuantize> op{nullptr};
+};
+
+CpuQuantize::CpuQuantize() : impl_(std::make_unique<Impl>())
+{
+}
+CpuQuantize::~CpuQuantize() = default;
+
+Status CpuQuantize::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    // Static check: forwards the tensor infos unchanged to the wrapped operator's validate().
+    return cpu::CpuQuantize::validate(input, output);
+}
+
+void CpuQuantize::configure(const ITensorInfo *input, ITensorInfo *output)
+{
+    // Configure a fresh operator and only store it once configure() has returned. If the wrapped
+    // configure() throws, a previously configured operator stays in place instead of being replaced
+    // by an unconfigured one.
+    auto configured_op = std::make_unique<cpu::CpuQuantize>();
+    configured_op->configure(input, output);
+    impl_->op = std::move(configured_op);
+}
+
+void CpuQuantize::run(ITensorPack &pack)
+{
+    // impl_->op is null until configure() has succeeded, so configure() must be called first.
+    impl_->op->run(pack);
+}
+} // namespace op
+} // namespace experimental
+} // namespace arm_compute
diff --git a/support/Bfloat16.h b/support/Bfloat16.h
index 7c5ef78848..d23cee5fe7 100644
--- a/support/Bfloat16.h
+++ b/support/Bfloat16.h
@@ -102,7 +102,7 @@ class bfloat16 final
      *
      * @param[in] v Floating-point value
      */
-    bfloat16(float v) : value(float_to_bf16(v))
+    explicit bfloat16(float v) : value(float_to_bf16(v))
     {
     }
     /** Constructor
diff --git a/support/SaturateCast.h b/support/SaturateCast.h
index 7af9f983ed..64a2157afe 100644
--- a/support/SaturateCast.h
+++ b/support/SaturateCast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H
-#define ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H
+#ifndef ACL_SUPPORT_SATURATECAST_H
+#define ACL_SUPPORT_SATURATECAST_H
 
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "arm_compute/core/utils/misc/Utility.h"
@@ -190,6 +190,18 @@ inline T saturate_cast(U v)
     return saturate_cast<T>(vi);
 }
 
+// float -> int, truncating: static_cast to int32_t (rounds toward zero), then saturate_cast to T
+template<typename T,
+         typename U,
+         typename std::enable_if<std::is_integral<T>::value &&
+                                 traits::is_floating_point<U>::value,
+                  int >::type = 0 >
+inline T saturate_static_cast(U v)
+{
+    int32_t vi = static_cast<int32_t>(v); // NOTE(review): undefined if v is NaN or outside int32 range
+    return saturate_cast<T>(vi);
+}
+
 // int -> float
 template<typename T,
          typename U,
@@ -216,4 +228,4 @@ inline T saturate_cast(U v)
 } // namespace cast
 } // namespace utils
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H */
+#endif // ACL_SUPPORT_SATURATECAST_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 20a010f38c..dcda7c3246 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -20,94 +20,13 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+# CONFIGURE_DEPENDS makes the build re-run this glob, so added/removed reference sources are noticed.
+file(GLOB_RECURSE validation_reference CONFIGURE_DEPENDS "validation/reference/*.cpp")
 target_sources(
   arm_compute_validation_framework
   PRIVATE validation/Validation.cpp
           validation/Helpers.cpp
-          validation/reference/BoundingBoxTransform.cpp
-          validation/reference/GEMMReshapeRHSMatrix.cpp
-          validation/reference/ChannelShuffle.cpp
-          validation/reference/Logical.cpp
-          validation/reference/PoolingLayer.cpp
-          validation/reference/BitwiseNot.cpp
-          validation/reference/Conv3D.cpp
-          validation/reference/GEMMReshapeLHSMatrix.cpp
-          validation/reference/ComputeAllAnchors.cpp
-          validation/reference/DepthConcatenateLayer.cpp
-          validation/reference/TableLookup.cpp
-          validation/reference/ROIPoolingLayer.cpp
-          validation/reference/SliceOperations.cpp
-          validation/reference/GEMMLowp.cpp
-          validation/reference/Unstack.cpp
-          validation/reference/Pooling3dLayer.cpp
-          validation/reference/BitwiseOr.cpp
-          validation/reference/ReshapeLayer.cpp
-          validation/reference/SoftmaxLayer.cpp
-          validation/reference/Gather.cpp
-          validation/reference/Utils.cpp
-          validation/reference/Accumulate.cpp
-          validation/reference/CropResize.cpp
-          validation/reference/ReductionOperation.cpp
-          validation/reference/ConcatenateLayer.cpp
-          validation/reference/PixelWiseMultiplication.cpp
-          validation/reference/DepthConvertLayer.cpp
-          validation/reference/Erode.cpp
-          validation/reference/DepthToSpaceLayer.cpp
-          validation/reference/PadLayer.cpp
-          validation/reference/MeanStdDevNormalizationLayer.cpp
-          validation/reference/BitwiseXor.cpp
-          validation/reference/GEMM.cpp
-          validation/reference/NormalizePlanarYUVLayer.cpp
-          validation/reference/FuseBatchNormalization.cpp
-          validation/reference/BitwiseAnd.cpp
-          validation/reference/SpaceToDepth.cpp
-          validation/reference/NonMaximaSuppression.cpp
-          validation/reference/Reverse.cpp
-          validation/reference/DFT.cpp
-          validation/reference/L2NormalizeLayer.cpp
-          validation/reference/ActivationLayer.cpp
-          validation/reference/SpaceToBatch.cpp
-          validation/reference/Im2Col.cpp
-          validation/reference/DequantizationLayer.cpp
-          validation/reference/DeconvolutionLayer.cpp
-          validation/reference/MinMaxLocation.cpp
-          validation/reference/Select.cpp
-          validation/reference/BatchNormalizationLayer.cpp
-          validation/reference/InstanceNormalizationLayer.cpp
-          validation/reference/ROIAlignLayer.cpp
-          validation/reference/ElementwiseUnary.cpp
-          validation/reference/MeanStdDev.cpp
-          validation/reference/QLSTMLayerNormalization.cpp
-          validation/reference/Col2Im.cpp
-          validation/reference/FlattenLayer.cpp
-          validation/reference/AbsoluteDifference.cpp
-          validation/reference/Transpose.cpp
-          validation/reference/StackLayer.cpp
-          validation/reference/NormalizationLayer.cpp
-          validation/reference/Copy.cpp
-          validation/reference/MaxUnpoolingLayer.cpp
-          validation/reference/Winograd.cpp
-          validation/reference/Permute.cpp
-          validation/reference/Comparisons.cpp
-          validation/reference/Tile.cpp
-          validation/reference/BatchToSpaceLayer.cpp
-          validation/reference/ElementwiseOperations.cpp
-          validation/reference/QuantizationLayer.cpp
-          validation/reference/NonMaxSuppression.cpp
-          validation/reference/WeightsReshape.cpp
-          validation/reference/ArithmeticOperations.cpp
-          validation/reference/ConvertFullyConnectedWeights.cpp
-          validation/reference/Floor.cpp
-          validation/reference/PriorBoxLayer.cpp
-          validation/reference/Scale.cpp
-          validation/reference/ScatterLayer.cpp
-          validation/reference/ReorgLayer.cpp
-          validation/reference/Range.cpp
-          validation/reference/ArithmeticDivision.cpp
-          validation/reference/DepthwiseConvolutionLayer.cpp
-          validation/reference/FullyConnectedLayer.cpp
-          validation/reference/ConvolutionLayer.cpp
-          validation/reference/Reorder.cpp
+          ${validation_reference}
           framework/Framework.cpp
           framework/Utils.cpp
           framework/Exceptions.cpp
diff --git a/tests/framework/instruments/hwc_names.hpp b/tests/framework/instruments/hwc_names.hpp
index c39f3bba7a..766573ac04 100644
--- a/tests/framework/instruments/hwc_names.hpp
+++ b/tests/framework/instruments/hwc_names.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_HWC_NAMES
-#define ARM_COMPUTE_TEST_HWC_NAMES
+#ifndef ACL_TESTS_FRAMEWORK_INSTRUMENTS_HWC_NAMES_HPP
+#define ACL_TESTS_FRAMEWORK_INSTRUMENTS_HWC_NAMES_HPP
+
+#include <cstdint>
 
 #ifndef DOXYGEN_SKIP_THIS
 
@@ -3061,4 +3063,4 @@ enum
 
 #endif /* DOXYGEN_SKIP_THIS */
 
-#endif /* ARM_COMPUTE_TEST_HWC_NAMES */
+#endif // ACL_TESTS_FRAMEWORK_INSTRUMENTS_HWC_NAMES_HPP
diff --git a/tests/validation/CL/Cast.cpp b/tests/validation/CL/Cast.cpp
index 2f943e84d8..46238b431a 100644
--- a/tests/validation/CL/Cast.cpp
+++ b/tests/validation/CL/Cast.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
 #include "arm_compute/runtime/CL/functions/CLCast.h"
 #include "tests/CL/CLAccessor.h"
-#include "tests/PaddingCalculator.h"
 #include "tests/datasets/ConvertPolicyDataset.h"
+#include "tests/datasets/DatatypeDataset.h"
 #include "tests/datasets/ShapeDatasets.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
@@ -35,12 +35,17 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/CastFixture.h"
 
+#include <map>
+#include <vector>
+
 namespace arm_compute
 {
 namespace test
 {
 namespace validation
 {
+
+using framework::dataset::make;
 namespace
 {
 // Tolerance
@@ -49,100 +54,176 @@ constexpr AbsoluteTolerance<float> zero_tolerance(0);
 
 /** Input data sets **/
 // QASYMM8
-const auto CastQASYMM8toF32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::F32));
-const auto CastQSYMM8toF32Dataset = combine(framework::dataset::make("DataType", DataType::QSYMM8), framework::dataset::make("DataType", DataType::F32));
+const auto CastQASYMM8toF32Dataset = combine(make("DataType", DataType::QASYMM8), make("DataType", DataType::F32));
+const auto CastQSYMM8toF32Dataset = combine(make("DataType", DataType::QSYMM8), make("DataType", DataType::F32));
+
+#define U8Types DataType::U8, DataType::QASYMM8 // list spliced into make("DataType", {...}); paired with the uint8_t fixtures
+#define S8Types DataType::S8, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::QASYMM8_SIGNED // paired with the int8_t fixtures
+#define S8Types_wo_q8_pc DataType::S8, DataType::QSYMM8, DataType::QASYMM8_SIGNED // S8Types minus QSYMM8_PER_CHANNEL
+#define U16Types DataType::QASYMM16, DataType::U16 // paired with the uint16_t fixtures
+#define S16Types DataType::S16, DataType::QSYMM16 // paired with the int16_t fixtures
 
 // U8
-const auto CastU8toS8Dataset  = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S8));
-const auto CastU8toU16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U16));
-const auto CastU8toS16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S16));
-const auto CastU8toU32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U32));
-const auto CastU8toS32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S32));
-const auto CastU8toF16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::F16));
-const auto CastU8toF32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::F32));
+const auto CastU8toS8Dataset  = combine(make("DataType", {U8Types}), make("DataType", {S8Types}));
+const auto CastU8toU16Dataset = combine(make("DataType", {U8Types}), make("DataType", {U16Types}));
+const auto CastU8toS16Dataset = combine(make("DataType", {U8Types}), make("DataType", {S16Types}));
+const auto CastU8toU32Dataset = combine(make("DataType", {U8Types}), make("DataType", DataType::U32));
+const auto CastU8toS32Dataset = combine(make("DataType", {U8Types}), make("DataType", DataType::S32));
+const auto CastU8toF16Dataset = combine(make("DataType", {U8Types}), make("DataType", DataType::F16));
+const auto CastU8toF32Dataset = combine(make("DataType", {U8Types}), make("DataType", DataType::F32));
 
 // S8
-const auto CastS8toU8Dataset  = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::U8));
-const auto CastS8toU16Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::U16));
-const auto CastS8toS16Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::S16));
-const auto CastS8toU32Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::U32));
-const auto CastS8toS32Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::S32));
-const auto CastS8toF16Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::F16));
-const auto CastS8toF32Dataset = combine(framework::dataset::make("DataType", DataType::S8), framework::dataset::make("DataType", DataType::F32));
+const auto CastS8toU8Dataset  = combine(make("DataType", {S8Types_wo_q8_pc}), make("DataType", {U8Types}));
+const auto CastQSYMM8_PER_CHANNELtoU8Dataset  = combine(make("DataType", DataType::QSYMM8_PER_CHANNEL), make("DataType", DataType::U8));
+const auto CastS8toU16Dataset = combine(make("DataType", {S8Types}), make("DataType", {U16Types}));
+const auto CastS8toS16Dataset = combine(make("DataType", {S8Types}), make("DataType", {S16Types}));
+const auto CastS8toU32Dataset = combine(make("DataType", {S8Types}), make("DataType", DataType::U32));
+const auto CastS8toS32Dataset = combine(make("DataType", {S8Types}), make("DataType", DataType::S32));
+const auto CastS8toF16Dataset = combine(make("DataType", {S8Types}), make("DataType", DataType::F16));
+const auto CastS8toF32Dataset = combine(make("DataType", {S8Types}), make("DataType", DataType::F32));
 
 // U16
-const auto CastU16toU8Dataset  = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U8));
-const auto CastU16toS8Dataset  = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::S8));
-const auto CastU16toS16Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::S16));
-const auto CastU16toU32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U32));
-const auto CastU16toS32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::S32));
-const auto CastU16toF16Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::F16));
-const auto CastU16toF32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::F32));
+const auto CastU16toU8Dataset  = combine(make("DataType", {U16Types}), make("DataType", {U8Types}));
+const auto CastU16toS8Dataset  = combine(make("DataType", {U16Types}), make("DataType", {S8Types}));
+const auto CastU16toS16Dataset = combine(make("DataType", {U16Types}), make("DataType", {S16Types}));
+const auto CastU16toU32Dataset = combine(make("DataType", {U16Types}), make("DataType", DataType::U32));
+const auto CastU16toS32Dataset = combine(make("DataType", {U16Types}), make("DataType", DataType::S32));
+const auto CastU16toF16Dataset = combine(make("DataType", {U16Types}), make("DataType", DataType::F16));
+const auto CastU16toF32Dataset = combine(make("DataType", {U16Types}), make("DataType", DataType::F32));
 
 // S16
-const auto CastS16toU8Dataset  = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U8));
-const auto CastS16toS8Dataset  = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S8));
-const auto CastS16toU16Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U16));
-const auto CastS16toU32Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U32));
-const auto CastS16toS32Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S32));
-const auto CastS16toF16Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::F16));
-const auto CastS16toF32Dataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::F32));
+const auto CastS16toU8Dataset  = combine(make("DataType", {S16Types}), make("DataType", {U8Types}));
+const auto CastS16toS8Dataset  = combine(make("DataType", {S16Types}), make("DataType", {S8Types}));
+const auto CastS16toU16Dataset = combine(make("DataType", {S16Types}), make("DataType", {U16Types}));
+const auto CastS16toU32Dataset = combine(make("DataType", {S16Types}), make("DataType", DataType::U32));
+const auto CastS16toS32Dataset = combine(make("DataType", {S16Types}), make("DataType", DataType::S32));
+const auto CastS16toF16Dataset = combine(make("DataType", {S16Types}), make("DataType", DataType::F16));
+const auto CastS16toF32Dataset = combine(make("DataType", {S16Types}), make("DataType", DataType::F32));
 
 // U32
-const auto CastU32toU8Dataset  = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::U8));
-const auto CastU32toS8Dataset  = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::S8));
-const auto CastU32toU16Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::U16));
-const auto CastU32toS16Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::S16));
-const auto CastU32toS32Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::S32));
-const auto CastU32toF16Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::F16));
-const auto CastU32toF32Dataset = combine(framework::dataset::make("DataType", DataType::U32), framework::dataset::make("DataType", DataType::F32));
+const auto CastU32toU8Dataset  = combine(make("DataType", DataType::U32), make("DataType", {U8Types}));
+const auto CastU32toS8Dataset  = combine(make("DataType", DataType::U32), make("DataType", {S8Types}));
+const auto CastU32toU16Dataset = combine(make("DataType", DataType::U32), make("DataType", {U16Types}));
+const auto CastU32toS16Dataset = combine(make("DataType", DataType::U32), make("DataType", {S16Types}));
+const auto CastU32toS32Dataset = combine(make("DataType", DataType::U32), make("DataType", DataType::S32));
+const auto CastU32toF16Dataset = combine(make("DataType", DataType::U32), make("DataType", DataType::F16));
+const auto CastU32toF32Dataset = combine(make("DataType", DataType::U32), make("DataType", DataType::F32));
 
 // S32
-const auto CastS32toU8Dataset  = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U8));
-const auto CastS32toS8Dataset  = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::S8));
-const auto CastS32toU16Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U16));
-const auto CastS32toS16Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::S16));
-const auto CastS32toU32Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U32));
-const auto CastS32toF16Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F16));
-const auto CastS32toF32Dataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F32));
+const auto CastS32toU8Dataset  = combine(make("DataType", DataType::S32), make("DataType", {U8Types}));
+const auto CastS32toS8Dataset  = combine(make("DataType", DataType::S32), make("DataType", {S8Types}));
+const auto CastS32toU16Dataset = combine(make("DataType", DataType::S32), make("DataType", {U16Types}));
+const auto CastS32toS16Dataset = combine(make("DataType", DataType::S32), make("DataType", {S16Types}));
+const auto CastS32toU32Dataset = combine(make("DataType", DataType::S32), make("DataType", DataType::U32));
+const auto CastS32toF16Dataset = combine(make("DataType", DataType::S32), make("DataType", DataType::F16));
+const auto CastS32toF32Dataset = combine(make("DataType", DataType::S32), make("DataType", DataType::F32));
 
 // F16
-const auto CastF16toU8Dataset  = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U8));
-const auto CastF16toS8Dataset  = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S8));
-const auto CastF16toU16Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U16));
-const auto CastF16toS16Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S16));
-const auto CastF16toU32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::U32));
-const auto CastF16toS32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S32));
-const auto CastF16toF32Dataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32));
+const auto CastF16toU8Dataset  = combine(make("DataType", DataType::F16), make("DataType", {U8Types}));
+const auto CastF16toS8Dataset  = combine(make("DataType", DataType::F16), make("DataType", {S8Types}));
+const auto CastF16toU16Dataset = combine(make("DataType", DataType::F16), make("DataType", {U16Types}));
+const auto CastF16toS16Dataset = combine(make("DataType", DataType::F16), make("DataType", {S16Types}));
+const auto CastF16toU32Dataset = combine(make("DataType", DataType::F16), make("DataType", DataType::U32));
+const auto CastF16toS32Dataset = combine(make("DataType", DataType::F16), make("DataType", DataType::S32));
+const auto CastF16toF32Dataset = combine(make("DataType", DataType::F16), make("DataType", DataType::F32));
 
 // F32
-const auto CastF32toU8Dataset  = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U8));
-const auto CastF32toS8Dataset  = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S8));
-const auto CastF32toU16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U16));
-const auto CastF32toS16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S16));
-const auto CastF32toU32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U32));
-const auto CastF32toS32Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S32));
-const auto CastF32toF16Dataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
+const auto CastF32toU8Dataset  = combine(make("DataType", DataType::F32), make("DataType", {U8Types}));
+const auto CastF32toS8Dataset  = combine(make("DataType", DataType::F32), make("DataType", {S8Types}));
+const auto CastF32toU16Dataset = combine(make("DataType", DataType::F32), make("DataType", {U16Types}));
+const auto CastF32toS16Dataset = combine(make("DataType", DataType::F32), make("DataType", {S16Types}));
+const auto CastF32toU32Dataset = combine(make("DataType", DataType::F32), make("DataType", DataType::U32));
+const auto CastF32toS32Dataset = combine(make("DataType", DataType::F32), make("DataType", DataType::S32));
+const auto CastF32toF16Dataset = combine(make("DataType", DataType::F32), make("DataType", DataType::F16));
 
 // U64
-const auto CastU64toU8Dataset  = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U8));
-const auto CastU64toS8Dataset  = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S8));
-const auto CastU64toU16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U16));
-const auto CastU64toS16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S16));
-const auto CastU64toU32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::U32));
-const auto CastU64toS32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::S32));
-const auto CastU64toF16Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F16));
-const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32));
+const auto CastU64toU8Dataset  = combine(make("DataType", DataType::U64), make("DataType", {U8Types}));
+const auto CastU64toS8Dataset  = combine(make("DataType", DataType::U64), make("DataType", {S8Types}));
+const auto CastU64toU16Dataset = combine(make("DataType", DataType::U64), make("DataType", {U16Types}));
+const auto CastU64toS16Dataset = combine(make("DataType", DataType::U64), make("DataType", {S16Types}));
+const auto CastU64toU32Dataset = combine(make("DataType", DataType::U64), make("DataType", DataType::U32));
+const auto CastU64toS32Dataset = combine(make("DataType", DataType::U64), make("DataType", DataType::S32));
+const auto CastU64toF16Dataset = combine(make("DataType", DataType::U64), make("DataType", DataType::F16));
+const auto CastU64toF32Dataset = combine(make("DataType", DataType::U64), make("DataType", DataType::F32));
 
 // S64
-const auto CastS64toU8Dataset  = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U8));
-const auto CastS64toS8Dataset  = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S8));
-const auto CastS64toU16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U16));
-const auto CastS64toS16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S16));
-const auto CastS64toU32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::U32));
-const auto CastS64toS32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::S32));
-const auto CastS64toF16Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F16));
-const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32));
+const auto CastS64toU8Dataset  = combine(make("DataType", DataType::S64), make("DataType", {U8Types}));
+const auto CastS64toS8Dataset  = combine(make("DataType", DataType::S64), make("DataType", {S8Types}));
+const auto CastS64toU16Dataset = combine(make("DataType", DataType::S64), make("DataType", {U16Types}));
+const auto CastS64toS16Dataset = combine(make("DataType", DataType::S64), make("DataType", {S16Types}));
+const auto CastS64toU32Dataset = combine(make("DataType", DataType::S64), make("DataType", DataType::U32));
+const auto CastS64toS32Dataset = combine(make("DataType", DataType::S64), make("DataType", DataType::S32));
+const auto CastS64toF16Dataset = combine(make("DataType", DataType::S64), make("DataType", DataType::F16));
+const auto CastS64toF32Dataset = combine(make("DataType", DataType::S64), make("DataType", DataType::F32));
+
+/** Compare CLCast::validate() with the table of conversions this test expects to be supported.
+ *
+ * The comparison is reported through ARM_COMPUTE_EXPECT_EQUAL; on a mismatch the status'
+ * error description is also printed to std::cout.
+ *
+ * @param[in] input_dtype  Data type given to the source TensorInfo.
+ * @param[in] output_dtype Data type given to the destination TensorInfo.
+ */
+void validate_data_types(DataType input_dtype, DataType output_dtype)
+{
+    using SupportedCasts = std::map<DataType, std::vector<DataType>>;
+    // Built once on first use instead of being re-assigned row by row on every call.
+    static const SupportedCasts supported_dtypes = []()
+    {
+        SupportedCasts table;
+        table[DataType::U8] = {
+            S8Types, U16Types, S16Types, DataType::U32, DataType::S32, DataType::F16, DataType::F32};
+        table[DataType::S8] = {
+            U8Types, U16Types, S16Types, DataType::U32, DataType::S32, DataType::F16, DataType::F32};
+        table[DataType::U16] = {
+            DataType::U8, DataType::S8, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32,
+            DataType::QSYMM8, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::QASYMM8_SIGNED, DataType::QSYMM16};
+        table[DataType::S16] = {
+            S8Types, U8Types, U16Types, DataType::U32, DataType::S32, DataType::F16, DataType::F32};
+        table[DataType::U32] = {
+            S8Types, U8Types, U16Types, S16Types, DataType::S32, DataType::F16, DataType::F32};
+        table[DataType::S32] = {
+            S8Types, U8Types, U16Types, S16Types, DataType::U32, DataType::F16, DataType::F32};
+        table[DataType::U64] = {
+            S8Types, U8Types, U16Types, S16Types, DataType::U32, DataType::S32, DataType::F16, DataType::F32};
+        table[DataType::S64] = {
+            S8Types, U8Types, U16Types, S16Types, DataType::U32, DataType::S32, DataType::F16, DataType::F32};
+        table[DataType::F16] = {
+            S8Types, U8Types, U16Types, S16Types, DataType::U32, DataType::S32, DataType::F32};
+        table[DataType::F32] = {
+            S8Types, U8Types, U16Types, S16Types, DataType::U32, DataType::S32, DataType::F16};
+
+        // These quantized inputs copy the S8 / U8 / S16 / U16 rows defined above.
+        table[DataType::QSYMM8]         = table[DataType::S8];
+        table[DataType::QASYMM8_SIGNED] = table[DataType::S8];
+        table[DataType::QASYMM8]        = table[DataType::U8];
+        table[DataType::QSYMM16]        = table[DataType::S16];
+        table[DataType::QASYMM16]       = table[DataType::U16];
+
+        // Per-channel input has its own row: unlike the S8 row it lists U8 but not QASYMM8.
+        table[DataType::QSYMM8_PER_CHANNEL] = {
+            U16Types, S16Types, DataType::U8, DataType::U32, DataType::S32, DataType::F16, DataType::F32};
+        return table;
+    }();
+
+    const auto input  = TensorInfo(TensorShape(16U, 16U, 5U), 1, input_dtype);
+    auto       output = TensorInfo(TensorShape(16U, 16U, 5U), 1, output_dtype);
+
+    const Status status   = CLCast::validate(&input, &output, ConvertPolicy::SATURATE);
+    const bool   is_valid = static_cast<bool>(status);
+
+    // A missing row means no conversion is expected; a found row is searched in place (no copy).
+    const auto row      = supported_dtypes.find(input_dtype);
+    const bool expected = (row != supported_dtypes.end()) &&
+        (std::find(row->second.begin(), row->second.end(), output_dtype) != row->second.end());
+
+    ARM_COMPUTE_EXPECT_EQUAL(is_valid, expected, framework::LogLevel::ERRORS);
+
+    if(is_valid != expected)
+    {
+        std::cout << status.error_description() << std::endl;
+    }
+}
 } // namespace
 
 TEST_SUITE(CL)
@@ -164,7 +245,17 @@ using CLCastToF16Fixture = CastValidationFixture<CLTensor, CLAccessor, CLCast, T
 template <typename T>
 using CLCastToF32Fixture = CastValidationFixture<CLTensor, CLAccessor, CLCast, T, float>;
 
-#define CAST_SUITE(NAME, idt, odt, type, dataset, tolerance)                                                                     \
+DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::ALL, // runs in every dataset mode
+    combine(
+        datasets::AllDataTypes("InputDataType"),
+        datasets::AllDataTypes("OutputDataType")), // every (input, output) pair drawn from AllDataTypes
+        input_dtype, output_dtype)
+{
+    validate_data_types(input_dtype, output_dtype); // compares CLCast::validate() with the expected support table
+}
+
+
+#define CAST_SUITE(NAME, type, dataset, tolerance)                                                                     \
     TEST_SUITE(NAME)                                                                                                             \
     FIXTURE_DATA_TEST_CASE(RunSmall, type, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), dataset), \
                                                                                       datasets::ConvertPolicies()))              \
@@ -173,103 +264,97 @@ using CLCastToF32Fixture = CastValidationFixture<CLTensor, CLAccessor, CLCast, T
     }                                                                                                                            \
     TEST_SUITE_END()
 
-// QASYMM8
-CAST_SUITE(QASYMM8_to_F32, DataType::QASYMM8, DataType::F32, CLCastToF32Fixture<uint8_t>, CastQASYMM8toF32Dataset, zero_tolerance)
-// QSYMM8
-CAST_SUITE(QSYMM8_to_F32, DataType::QSYMM8, DataType::F32, CLCastToF32Fixture<int8_t>, CastQSYMM8toF32Dataset, zero_tolerance)
-
-
 // U8
-CAST_SUITE(U8_to_S8, DataType::U8, DataType::S8, CLCastToS8Fixture<uint8_t>, CastU8toS8Dataset, zero_tolerance)
-CAST_SUITE(U8_to_U16, DataType::U8, DataType::U16, CLCastToU16Fixture<uint8_t>, CastU8toU16Dataset, zero_tolerance)
-CAST_SUITE(U8_to_S16, DataType::U8, DataType::S16, CLCastToS16Fixture<uint8_t>, CastU8toS16Dataset, zero_tolerance)
-CAST_SUITE(U8_to_U32, DataType::U8, DataType::U32, CLCastToU32Fixture<uint8_t>, CastU8toU32Dataset, zero_tolerance)
-CAST_SUITE(U8_to_S32, DataType::U8, DataType::S32, CLCastToS32Fixture<uint8_t>, CastU8toS32Dataset, zero_tolerance)
-CAST_SUITE(U8_to_F16, DataType::U8, DataType::F16, CLCastToF16Fixture<uint8_t>, CastU8toF16Dataset, zero_tolerance)
-CAST_SUITE(U8_to_F32, DataType::U8, DataType::F32, CLCastToF32Fixture<uint8_t>, CastU8toF32Dataset, zero_tolerance)
+CAST_SUITE(U8_to_S8, CLCastToS8Fixture<uint8_t>, CastU8toS8Dataset, zero_tolerance)
+CAST_SUITE(U8_to_U16, CLCastToU16Fixture<uint8_t>, CastU8toU16Dataset, zero_tolerance)
+CAST_SUITE(U8_to_S16, CLCastToS16Fixture<uint8_t>, CastU8toS16Dataset, zero_tolerance)
+CAST_SUITE(U8_to_U32, CLCastToU32Fixture<uint8_t>, CastU8toU32Dataset, zero_tolerance)
+CAST_SUITE(U8_to_S32, CLCastToS32Fixture<uint8_t>, CastU8toS32Dataset, zero_tolerance)
+CAST_SUITE(U8_to_F16, CLCastToF16Fixture<uint8_t>, CastU8toF16Dataset, zero_tolerance)
+CAST_SUITE(U8_to_F32, CLCastToF32Fixture<uint8_t>, CastU8toF32Dataset, zero_tolerance)
 
 // S8
-CAST_SUITE(S8_to_U8, DataType::S8, DataType::U8, CLCastToU8Fixture<int8_t>, CastS8toU8Dataset, zero_tolerance)
-CAST_SUITE(S8_to_U16, DataType::S8, DataType::U16, CLCastToU16Fixture<int8_t>, CastS8toU16Dataset, zero_tolerance)
-CAST_SUITE(S8_to_S16, DataType::S8, DataType::S16, CLCastToS16Fixture<int8_t>, CastS8toS16Dataset, zero_tolerance)
-CAST_SUITE(S8_to_U32, DataType::S8, DataType::U32, CLCastToU32Fixture<int8_t>, CastS8toU32Dataset, zero_tolerance)
-CAST_SUITE(S8_to_S32, DataType::S8, DataType::S32, CLCastToS32Fixture<int8_t>, CastS8toS32Dataset, zero_tolerance)
-CAST_SUITE(S8_to_F16, DataType::S8, DataType::F16, CLCastToF16Fixture<int8_t>, CastS8toF16Dataset, zero_tolerance)
-CAST_SUITE(S8_to_F32, DataType::S8, DataType::F32, CLCastToF32Fixture<int8_t>, CastS8toF32Dataset, zero_tolerance)
+CAST_SUITE(S8_to_U8, CLCastToU8Fixture<int8_t>, CastS8toU8Dataset, zero_tolerance)
+CAST_SUITE(S8_to_U16, CLCastToU16Fixture<int8_t>, CastS8toU16Dataset, zero_tolerance)
+CAST_SUITE(S8_to_S16, CLCastToS16Fixture<int8_t>, CastS8toS16Dataset, zero_tolerance)
+CAST_SUITE(S8_to_U32, CLCastToU32Fixture<int8_t>, CastS8toU32Dataset, zero_tolerance)
+CAST_SUITE(S8_to_S32, CLCastToS32Fixture<int8_t>, CastS8toS32Dataset, zero_tolerance)
+CAST_SUITE(S8_to_F16, CLCastToF16Fixture<int8_t>, CastS8toF16Dataset, zero_tolerance)
+CAST_SUITE(S8_to_F32, CLCastToF32Fixture<int8_t>, CastS8toF32Dataset, zero_tolerance)
 
 // U16
-CAST_SUITE(U16_to_U8, DataType::U16, DataType::U8, CLCastToU8Fixture<uint16_t>, CastU16toU8Dataset, zero_tolerance)
-CAST_SUITE(U16_to_S8, DataType::U16, DataType::S8, CLCastToS8Fixture<uint16_t>, CastU16toS8Dataset, zero_tolerance)
-CAST_SUITE(U16_to_S16, DataType::U16, DataType::S16, CLCastToS16Fixture<uint16_t>, CastU16toS16Dataset, zero_tolerance)
-CAST_SUITE(U16_to_U32, DataType::U16, DataType::U32, CLCastToU32Fixture<uint16_t>, CastU16toU32Dataset, zero_tolerance)
-CAST_SUITE(U16_to_S32, DataType::U16, DataType::S32, CLCastToS32Fixture<uint16_t>, CastU16toS32Dataset, zero_tolerance)
-CAST_SUITE(U16_to_F16, DataType::U16, DataType::F16, CLCastToF16Fixture<uint16_t>, CastU16toF16Dataset, zero_tolerance)
-CAST_SUITE(U16_to_F32, DataType::U16, DataType::F32, CLCastToF32Fixture<uint16_t>, CastU16toF32Dataset, zero_tolerance)
+CAST_SUITE(U16_to_U8, CLCastToU8Fixture<uint16_t>, CastU16toU8Dataset, zero_tolerance)
+CAST_SUITE(U16_to_S8, CLCastToS8Fixture<uint16_t>, CastU16toS8Dataset, zero_tolerance)
+CAST_SUITE(U16_to_S16, CLCastToS16Fixture<uint16_t>, CastU16toS16Dataset, zero_tolerance)
+CAST_SUITE(U16_to_U32, CLCastToU32Fixture<uint16_t>, CastU16toU32Dataset, zero_tolerance)
+CAST_SUITE(U16_to_S32, CLCastToS32Fixture<uint16_t>, CastU16toS32Dataset, zero_tolerance)
+CAST_SUITE(U16_to_F16, CLCastToF16Fixture<uint16_t>, CastU16toF16Dataset, zero_tolerance)
+CAST_SUITE(U16_to_F32, CLCastToF32Fixture<uint16_t>, CastU16toF32Dataset, zero_tolerance)
 
 // S16
-CAST_SUITE(S16_to_U8, DataType::S16, DataType::U8, CLCastToU8Fixture<int16_t>, CastS16toU8Dataset, zero_tolerance)
-CAST_SUITE(S16_to_S8, DataType::S16, DataType::S8, CLCastToS8Fixture<int16_t>, CastS16toS8Dataset, zero_tolerance)
-CAST_SUITE(S16_to_U16, DataType::S16, DataType::U16, CLCastToU16Fixture<int16_t>, CastS16toU16Dataset, zero_tolerance)
-CAST_SUITE(S16_to_U32, DataType::S16, DataType::U32, CLCastToU32Fixture<int16_t>, CastS16toU32Dataset, zero_tolerance)
-CAST_SUITE(S16_to_S32, DataType::S16, DataType::S32, CLCastToS32Fixture<int16_t>, CastS16toS32Dataset, zero_tolerance)
-CAST_SUITE(S16_to_F16, DataType::S16, DataType::F16, CLCastToF16Fixture<int16_t>, CastS16toF16Dataset, zero_tolerance)
-CAST_SUITE(S16_to_F32, DataType::S16, DataType::F32, CLCastToF32Fixture<int16_t>, CastS16toF32Dataset, zero_tolerance)
+CAST_SUITE(S16_to_U8, CLCastToU8Fixture<int16_t>, CastS16toU8Dataset, zero_tolerance)
+CAST_SUITE(S16_to_S8, CLCastToS8Fixture<int16_t>, CastS16toS8Dataset, zero_tolerance)
+CAST_SUITE(S16_to_U16, CLCastToU16Fixture<int16_t>, CastS16toU16Dataset, zero_tolerance)
+CAST_SUITE(S16_to_U32, CLCastToU32Fixture<int16_t>, CastS16toU32Dataset, zero_tolerance)
+CAST_SUITE(S16_to_S32, CLCastToS32Fixture<int16_t>, CastS16toS32Dataset, zero_tolerance)
+CAST_SUITE(S16_to_F16, CLCastToF16Fixture<int16_t>, CastS16toF16Dataset, zero_tolerance)
+CAST_SUITE(S16_to_F32, CLCastToF32Fixture<int16_t>, CastS16toF32Dataset, zero_tolerance)
 
 // U32
-CAST_SUITE(U32_to_U8, DataType::U32, DataType::U8, CLCastToU8Fixture<uint32_t>, CastU32toU8Dataset, zero_tolerance)
-CAST_SUITE(U32_to_S8, DataType::U32, DataType::S8, CLCastToS8Fixture<uint32_t>, CastU32toS8Dataset, zero_tolerance)
-CAST_SUITE(U32_to_U16, DataType::U32, DataType::U16, CLCastToU16Fixture<uint32_t>, CastU32toU16Dataset, zero_tolerance)
-CAST_SUITE(U32_to_S16, DataType::U32, DataType::S16, CLCastToS16Fixture<uint32_t>, CastU32toS16Dataset, zero_tolerance)
-CAST_SUITE(U32_to_S32, DataType::U32, DataType::S32, CLCastToS32Fixture<uint32_t>, CastU32toS32Dataset, zero_tolerance)
-CAST_SUITE(U32_to_F16, DataType::U32, DataType::F16, CLCastToF16Fixture<uint32_t>, CastU32toF16Dataset, zero_tolerance)
-CAST_SUITE(U32_to_F32, DataType::U32, DataType::F32, CLCastToF32Fixture<uint32_t>, CastU32toF32Dataset, zero_tolerance)
+CAST_SUITE(U32_to_U8, CLCastToU8Fixture<uint32_t>, CastU32toU8Dataset, zero_tolerance)
+CAST_SUITE(U32_to_S8, CLCastToS8Fixture<uint32_t>, CastU32toS8Dataset, zero_tolerance)
+CAST_SUITE(U32_to_U16, CLCastToU16Fixture<uint32_t>, CastU32toU16Dataset, zero_tolerance)
+CAST_SUITE(U32_to_S16, CLCastToS16Fixture<uint32_t>, CastU32toS16Dataset, zero_tolerance)
+CAST_SUITE(U32_to_S32, CLCastToS32Fixture<uint32_t>, CastU32toS32Dataset, zero_tolerance)
+CAST_SUITE(U32_to_F16, CLCastToF16Fixture<uint32_t>, CastU32toF16Dataset, zero_tolerance)
+CAST_SUITE(U32_to_F32, CLCastToF32Fixture<uint32_t>, CastU32toF32Dataset, zero_tolerance)
 
 // S32
-CAST_SUITE(S32_to_U8, DataType::S32, DataType::U8, CLCastToU8Fixture<int32_t>, CastS32toU8Dataset, zero_tolerance)
-CAST_SUITE(S32_to_S8, DataType::S32, DataType::S8, CLCastToS8Fixture<int32_t>, CastS32toS8Dataset, zero_tolerance)
-CAST_SUITE(S32_to_U16, DataType::S32, DataType::U16, CLCastToU16Fixture<int32_t>, CastS32toU16Dataset, zero_tolerance)
-CAST_SUITE(S32_to_S16, DataType::S32, DataType::S16, CLCastToS16Fixture<int32_t>, CastS32toS16Dataset, zero_tolerance)
-CAST_SUITE(S32_to_U32, DataType::S32, DataType::U32, CLCastToU32Fixture<int32_t>, CastS32toU32Dataset, zero_tolerance)
-CAST_SUITE(S32_to_F16, DataType::S32, DataType::F16, CLCastToF16Fixture<int32_t>, CastS32toF16Dataset, zero_tolerance)
-CAST_SUITE(S32_to_F32, DataType::S32, DataType::F32, CLCastToF32Fixture<int32_t>, CastS32toF32Dataset, zero_tolerance)
+CAST_SUITE(S32_to_U8, CLCastToU8Fixture<int32_t>, CastS32toU8Dataset, zero_tolerance)
+CAST_SUITE(S32_to_S8, CLCastToS8Fixture<int32_t>, CastS32toS8Dataset, zero_tolerance)
+CAST_SUITE(S32_to_U16, CLCastToU16Fixture<int32_t>, CastS32toU16Dataset, zero_tolerance)
+CAST_SUITE(S32_to_S16, CLCastToS16Fixture<int32_t>, CastS32toS16Dataset, zero_tolerance)
+CAST_SUITE(S32_to_U32, CLCastToU32Fixture<int32_t>, CastS32toU32Dataset, zero_tolerance)
+CAST_SUITE(S32_to_F16, CLCastToF16Fixture<int32_t>, CastS32toF16Dataset, zero_tolerance)
+CAST_SUITE(S32_to_F32, CLCastToF32Fixture<int32_t>, CastS32toF32Dataset, zero_tolerance)
 
 // F16
-CAST_SUITE(F16_to_U8, DataType::F16, DataType::U8, CLCastToU8Fixture<half>, CastF16toU8Dataset, one_tolerance)
-CAST_SUITE(F16_to_S8, DataType::F16, DataType::S8, CLCastToS8Fixture<half>, CastF16toS8Dataset, one_tolerance)
-CAST_SUITE(F16_to_U16, DataType::F16, DataType::U16, CLCastToU16Fixture<half>, CastF16toU16Dataset, one_tolerance)
-CAST_SUITE(F16_to_S16, DataType::F16, DataType::S16, CLCastToS16Fixture<half>, CastF16toS16Dataset, one_tolerance)
-CAST_SUITE(F16_to_U32, DataType::F16, DataType::U32, CLCastToU32Fixture<half>, CastF16toU32Dataset, one_tolerance)
-CAST_SUITE(F16_to_S32, DataType::F16, DataType::S32, CLCastToS32Fixture<half>, CastF16toS32Dataset, one_tolerance)
-CAST_SUITE(F16_to_F32, DataType::F16, DataType::F32, CLCastToF32Fixture<half>, CastF16toF32Dataset, zero_tolerance)
+CAST_SUITE(F16_to_U8, CLCastToU8Fixture<half>, CastF16toU8Dataset, one_tolerance)
+CAST_SUITE(F16_to_S8, CLCastToS8Fixture<half>, CastF16toS8Dataset, one_tolerance)
+CAST_SUITE(F16_to_U16, CLCastToU16Fixture<half>, CastF16toU16Dataset, one_tolerance)
+CAST_SUITE(F16_to_S16, CLCastToS16Fixture<half>, CastF16toS16Dataset, one_tolerance)
+CAST_SUITE(F16_to_U32, CLCastToU32Fixture<half>, CastF16toU32Dataset, one_tolerance)
+CAST_SUITE(F16_to_S32, CLCastToS32Fixture<half>, CastF16toS32Dataset, one_tolerance)
+CAST_SUITE(F16_to_F32, CLCastToF32Fixture<half>, CastF16toF32Dataset, zero_tolerance)
 
 // F32
-CAST_SUITE(F32_to_U8, DataType::F32, DataType::U8, CLCastToU8Fixture<float>, CastF32toU8Dataset, one_tolerance)
-CAST_SUITE(F32_to_S8, DataType::F32, DataType::S8, CLCastToS8Fixture<float>, CastF32toS8Dataset, one_tolerance)
-CAST_SUITE(F32_to_U16, DataType::F32, DataType::U16, CLCastToU16Fixture<float>, CastF32toU16Dataset, one_tolerance)
-CAST_SUITE(F32_to_S16, DataType::F32, DataType::S16, CLCastToS16Fixture<float>, CastF32toS16Dataset, one_tolerance)
-CAST_SUITE(F32_to_U32, DataType::F32, DataType::U32, CLCastToU32Fixture<float>, CastF32toU32Dataset, one_tolerance)
-CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, CLCastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
-CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, CLCastToF16Fixture<float>, CastF32toF16Dataset, zero_tolerance)
+CAST_SUITE(F32_to_U8, CLCastToU8Fixture<float>, CastF32toU8Dataset, one_tolerance)
+CAST_SUITE(F32_to_S8, CLCastToS8Fixture<float>, CastF32toS8Dataset, one_tolerance)
+CAST_SUITE(F32_to_U16, CLCastToU16Fixture<float>, CastF32toU16Dataset, one_tolerance)
+CAST_SUITE(F32_to_S16, CLCastToS16Fixture<float>, CastF32toS16Dataset, one_tolerance)
+CAST_SUITE(F32_to_U32, CLCastToU32Fixture<float>, CastF32toU32Dataset, one_tolerance)
+CAST_SUITE(F32_to_S32, CLCastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
+CAST_SUITE(F32_to_F16, CLCastToF16Fixture<float>, CastF32toF16Dataset, zero_tolerance)
 
 // S64
-CAST_SUITE(S64_to_U8, DataType::S64, DataType::U8, CLCastToU8Fixture<int64_t>, CastS64toU8Dataset, one_tolerance)
-CAST_SUITE(S64_to_S8, DataType::S64, DataType::S8, CLCastToS8Fixture<int64_t>, CastS64toS8Dataset, one_tolerance)
-CAST_SUITE(S64_to_U16, DataType::S64, DataType::U16, CLCastToU16Fixture<int64_t>, CastS64toU16Dataset, one_tolerance)
-CAST_SUITE(S64_to_S16, DataType::S64, DataType::S16, CLCastToS16Fixture<int64_t>, CastS64toS16Dataset, one_tolerance)
-CAST_SUITE(S64_to_U32, DataType::S64, DataType::U32, CLCastToU32Fixture<int64_t>, CastS64toU32Dataset, one_tolerance)
-CAST_SUITE(S64_to_S32, DataType::S64, DataType::S32, CLCastToS32Fixture<int64_t>, CastS64toS32Dataset, one_tolerance)
-CAST_SUITE(S64_to_F16, DataType::S64, DataType::F16, CLCastToF16Fixture<int64_t>, CastS64toF16Dataset, zero_tolerance)
-CAST_SUITE(S64_to_F32, DataType::S64, DataType::F32, CLCastToF32Fixture<int64_t>, CastS64toF32Dataset, zero_tolerance)
+CAST_SUITE(S64_to_U8, CLCastToU8Fixture<int64_t>, CastS64toU8Dataset, one_tolerance)
+CAST_SUITE(S64_to_S8, CLCastToS8Fixture<int64_t>, CastS64toS8Dataset, one_tolerance)
+CAST_SUITE(S64_to_U16, CLCastToU16Fixture<int64_t>, CastS64toU16Dataset, one_tolerance)
+CAST_SUITE(S64_to_S16, CLCastToS16Fixture<int64_t>, CastS64toS16Dataset, one_tolerance)
+CAST_SUITE(S64_to_U32, CLCastToU32Fixture<int64_t>, CastS64toU32Dataset, one_tolerance)
+CAST_SUITE(S64_to_S32, CLCastToS32Fixture<int64_t>, CastS64toS32Dataset, one_tolerance)
+CAST_SUITE(S64_to_F16, CLCastToF16Fixture<int64_t>, CastS64toF16Dataset, zero_tolerance)
+CAST_SUITE(S64_to_F32, CLCastToF32Fixture<int64_t>, CastS64toF32Dataset, zero_tolerance)
 
 // U64
-CAST_SUITE(U64_to_U8, DataType::U64, DataType::U8, CLCastToU8Fixture<uint64_t>, CastU64toU8Dataset, one_tolerance)
-CAST_SUITE(U64_to_S8, DataType::U64, DataType::S8, CLCastToS8Fixture<uint64_t>, CastU64toS8Dataset, one_tolerance)
-CAST_SUITE(U64_to_U16, DataType::U64, DataType::U16, CLCastToU16Fixture<uint64_t>, CastU64toU16Dataset, one_tolerance)
-CAST_SUITE(U64_to_S16, DataType::U64, DataType::S16, CLCastToS16Fixture<uint64_t>, CastU64toS16Dataset, one_tolerance)
-CAST_SUITE(U64_to_U32, DataType::U64, DataType::U32, CLCastToU32Fixture<uint64_t>, CastU64toU32Dataset, one_tolerance)
-CAST_SUITE(U64_to_S32, DataType::U64, DataType::S32, CLCastToS32Fixture<uint64_t>, CastU64toS32Dataset, one_tolerance)
-CAST_SUITE(U64_to_F16, DataType::U64, DataType::F16, CLCastToF16Fixture<uint64_t>, CastU64toF16Dataset, zero_tolerance)
-CAST_SUITE(U64_to_F32, DataType::U64, DataType::F32, CLCastToF32Fixture<uint64_t>, CastU64toF32Dataset, zero_tolerance)
+CAST_SUITE(U64_to_U8, CLCastToU8Fixture<uint64_t>, CastU64toU8Dataset, one_tolerance)
+CAST_SUITE(U64_to_S8, CLCastToS8Fixture<uint64_t>, CastU64toS8Dataset, one_tolerance)
+CAST_SUITE(U64_to_U16, CLCastToU16Fixture<uint64_t>, CastU64toU16Dataset, one_tolerance)
+CAST_SUITE(U64_to_S16, CLCastToS16Fixture<uint64_t>, CastU64toS16Dataset, one_tolerance)
+CAST_SUITE(U64_to_U32, CLCastToU32Fixture<uint64_t>, CastU64toU32Dataset, one_tolerance)
+CAST_SUITE(U64_to_S32, CLCastToS32Fixture<uint64_t>, CastU64toS32Dataset, one_tolerance)
+CAST_SUITE(U64_to_F16, CLCastToF16Fixture<uint64_t>, CastU64toF16Dataset, zero_tolerance)
+CAST_SUITE(U64_to_F32, CLCastToF32Fixture<uint64_t>, CastU64toF32Dataset, zero_tolerance)
 
 TEST_SUITE_END() // Cast
 TEST_SUITE_END() // CL
diff --git a/tests/validation/CL/LogSoftmaxLayer.cpp b/tests/validation/CL/LogSoftmaxLayer.cpp
index b7f6a66e42..972d556ad2 100644
--- a/tests/validation/CL/LogSoftmaxLayer.cpp
+++ b/tests/validation/CL/LogSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,14 +45,24 @@ namespace
 /** Tolerance for float operations */
 RelativeTolerance<half>  tolerance_f16(half(0.2));
 RelativeTolerance<float> tolerance_f32(0.001f);
+
+/** Tolerance for quantized operations */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1U);
+constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_signed(1);
+
 } // namespace
 
+using framework::dataset::make;
+
 TEST_SUITE(CL)
 TEST_SUITE(LogSoftmaxLayer)
 
 template <typename T>
 using CLLogSoftmaxLayerFixture = SoftmaxValidationFixture<CLTensor, CLAccessor, CLLogSoftmaxLayer, T, true>;
 
+template <typename T>
+using CLLogSoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<CLTensor, CLAccessor, CLLogSoftmaxLayer, T, true>;
+
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
@@ -108,6 +118,73 @@ FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerFixture<float>, framework::Datase
 }
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, 1 })))
+{
+    // Validate output: CL result vs. reference, absolute tolerance of 1 (tolerance_qasymm8_signed)
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 })))
+{
+    // Validate output (same tolerance as RunSmall; this case is registered under DatasetMode::NIGHTLY)
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(datasets::SoftmaxLayer4DShapes(),
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -4, 3 })))
+{
+    // Validate output (4D shapes; the axis list includes a negative value)
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::QASYMM8),
+        make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, 1 })))
+{
+    // Validate output: CL result vs. reference, absolute tolerance of 1 (tolerance_qasymm8)
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+    combine(datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::QASYMM8),
+        make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 })))
+{
+    // Validate output (same tolerance as RunSmall; this case is registered under DatasetMode::NIGHTLY)
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(datasets::SoftmaxLayer4DShapes(),
+        make("DataType", DataType::QASYMM8),
+        make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0, -4, 3 })))
+{
+    // Validate output (4D shapes; the axis list includes a negative value)
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // LogSoftmaxLayer
 TEST_SUITE_END() // CL
 } // namespace validation
diff --git a/tests/validation/CL/QuantizationLayer.cpp b/tests/validation/CL/QuantizationLayer.cpp
index 335d8df293..25ac2f7d41 100644
--- a/tests/validation/CL/QuantizationLayer.cpp
+++ b/tests/validation/CL/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,14 +45,64 @@ namespace
 constexpr AbsoluteTolerance<float>    tolerance_f32(1.0f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
 constexpr AbsoluteTolerance<uint8_t>  tolerance_u8(1);     /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8 data types */
 constexpr AbsoluteTolerance<int8_t>   tolerance_s8(1);     /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8_SIGNED data types */
+constexpr AbsoluteTolerance<int8_t>   zero_tolerance_s8(0);
 constexpr AbsoluteTolerance<uint16_t> tolerance_u16(1);    /**< Tolerance value for comparing reference's output against implementation's output for QASYMM16 data types */
 const auto                            QuantizationSmallShapes = concat(datasets::Small3DShapes(), datasets::Small4DShapes());
 const auto                            QuantizationLargeShapes = concat(datasets::Large3DShapes(), datasets::Large4DShapes());
+
+void test_specific_case_int8(const std::vector<int8_t> &values, const std::vector<int8_t> &expected,
+    DataType dtype, const QuantizationInfo &in_qinfo, const QuantizationInfo &out_qinfo)
+{
+    // The test case here covers both Int8 and UInt8 because the underlying kernel is the same
+    const auto shape = TensorShape(values.size());
+    // 1D tensors, one element per entry of `values`; input and output differ only in their quantization info.
+    CLTensor input = create_tensor<CLTensor>(shape, dtype, 1, in_qinfo);
+    CLTensor output = create_tensor<CLTensor>(shape, dtype, 1, out_qinfo);
+
+    CLQuantizationLayer quant_layer;
+    quant_layer.configure(&input, &output);
+
+    input.allocator()->allocate();
+    output.allocator()->allocate();
+    // The reference tensor carries the output quantization info and is filled with `expected` below.
+    SimpleTensor<int8_t> ref {shape, dtype, 1, out_qinfo};
+
+    library->fill_static_values(CLAccessor(input), values);
+    library->fill_static_values(ref, expected);
+
+    quant_layer.run();
+    // zero_tolerance_s8 is AbsoluteTolerance<int8_t>(0): the output must match `expected` exactly.
+    validate(CLAccessor(output), ref, zero_tolerance_s8);
+}
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(QuantizationLayer)
 
+TEST_CASE(ProperlyRoundedRequantizationLt16Elements, framework::DatasetMode::ALL)
+{
+    std::vector<int8_t> values =   {1,3,5,7,9};
+    std::vector<int8_t> expected = {0,1,2,3,4}; // (x + 1)/2 - 1
+    // 5 elements, i.e. fewer than 16 as the test name says; scale goes 0.5 -> 1.0 with the offset kept at -1.
+    const auto dtype = DataType::QASYMM8_SIGNED;
+    const auto in_qinfo = QuantizationInfo(0.5f, -1);
+    const auto out_qinfo = QuantizationInfo(1.f, -1);
+
+    test_specific_case_int8(values, expected, dtype, in_qinfo, out_qinfo);
+}
+
+TEST_CASE(ProperlyRoundedRequantizationGt16Elements, framework::DatasetMode::ALL)
+{
+    std::vector<int8_t> values =   {1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35};
+    std::vector<int8_t> expected = {0,1,2,3,4,5 ,6 ,7 ,8 ,9 ,10,11,12,13,14,15,16,17}; // (x + 1)/2 - 1
+    // 18 elements, i.e. more than 16 as the test name says; scale goes 0.5 -> 1.0 with the offset kept at -1.
+    const auto dtype = DataType::QASYMM8_SIGNED;
+    const auto in_qinfo = QuantizationInfo(0.5f, -1);
+    const auto out_qinfo = QuantizationInfo(1.f, -1);
+
+    test_specific_case_int8(values, expected, dtype, in_qinfo, out_qinfo);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt
index 59cd4b0a88..56aafcad27 100644
--- a/tests/validation/CMakeLists.txt
+++ b/tests/validation/CMakeLists.txt
@@ -20,141 +20,22 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+file(GLOB_RECURSE files_validation_unit CONFIGURE_DEPENDS "UNIT/*.cpp") # CONFIGURE_DEPENDS: re-check the glob at build time so added/removed tests are picked up
+file(GLOB_RECURSE files_validation_cpp CONFIGURE_DEPENDS "CPP/*.cpp")   # same re-check for the CPP reference tests
+
 target_sources(
   arm_compute_validation
-  PRIVATE UNIT/SafeIntegerOps.cpp
-          UNIT/Version.cpp
-          UNIT/TensorInfo.cpp
-          UNIT/TensorShape.cpp
-          UNIT/Utils.cpp
-          UNIT/SubTensorInfo.cpp
-          UNIT/WindowIterator.cpp
-          UNIT/LifetimeManager.cpp
-          UNIT/GPUTarget.cpp
-          CPP/DetectionPostProcessLayer.cpp
-          CPP/TopKV.cpp
-          CPP/DFT.cpp
-          CPP/Permute.cpp
-          CPP/NonMaximumSuppression.cpp)
+  PRIVATE ${files_validation_unit}
+          ${files_validation_cpp}
+  )
+
+file(GLOB_RECURSE files_validation_neon CONFIGURE_DEPENDS "NEON/*.cpp")       # CONFIGURE_DEPENDS: re-check the glob at build time so added/removed tests are picked up
+file(GLOB_RECURSE files_validation_runtime CONFIGURE_DEPENDS "runtime/*.cpp") # same re-check for the runtime operator tests
 
 if(ENABLE_NEON)
   target_sources(
     arm_compute_validation
-    PRIVATE NEON/ElementwiseNegation.cpp
-            NEON/BoundingBoxTransform.cpp
-            NEON/ChannelShuffle.cpp
-            NEON/Logical.cpp
-            NEON/DilatedConvolutionLayer.cpp
-            NEON/PoolingLayer.cpp
-            NEON/BitwiseNot.cpp
-            NEON/FillBorder.cpp
-            NEON/ElementwiseRsqrtLayer.cpp
-            NEON/DepthConcatenateLayer.cpp
-            NEON/ElementwisePower.cpp
-            NEON/Fill.cpp
-            NEON/ROIPoolingLayer.cpp
-            NEON/LSTMLayer.cpp
-            NEON/ArithmeticSubtraction.cpp
-            NEON/GEMMLowp.cpp
-            NEON/Unstack.cpp
-            NEON/Slice.cpp
-            NEON/Pooling3dLayer.cpp
-            NEON/BitwiseOr.cpp
-            NEON/HeightConcatenateLayer.cpp
-            NEON/ReshapeLayer.cpp
-            NEON/SoftmaxLayer.cpp
-            NEON/Gather.cpp
-            NEON/CropResize.cpp
-            NEON/ReductionOperation.cpp
-            NEON/PixelWiseMultiplication.cpp
-            NEON/LogSoftmaxLayer.cpp
-            NEON/DepthConvertLayer.cpp
-            NEON/Flatten.cpp
-            NEON/ElementwiseKernelSelection.cpp
-            NEON/DepthToSpaceLayer.cpp
-            NEON/ElementwiseAbsoluteValue.cpp
-            NEON/PadLayer.cpp
-            NEON/MeanStdDevNormalizationLayer.cpp
-            NEON/GlobalPoolingLayer.cpp
-            NEON/RNNLayer.cpp
-            NEON/DetectionPostProcessLayer.cpp
-            NEON/ElementwiseRound.cpp
-            NEON/BitwiseXor.cpp
-            NEON/GEMM.cpp
-            NEON/FuseBatchNormalization.cpp
-            NEON/BitwiseAnd.cpp
-            NEON/ElementwiseMax.cpp
-            NEON/ReduceMean.cpp
-            NEON/Reverse.cpp
-            NEON/L2NormalizeLayer.cpp
-            NEON/Convolution3D.cpp
-            NEON/ArithmeticAddition.cpp
-            NEON/ActivationLayer.cpp
-            NEON/SpaceToBatchLayer.cpp
-            NEON/ElementwiseLog.cpp
-            NEON/LSTMLayerQuantized.cpp
-            NEON/Im2Col.cpp
-            NEON/DequantizationLayer.cpp
-            NEON/DeconvolutionLayer.cpp
-            NEON/Select.cpp
-            NEON/ElementwiseSin.cpp
-            NEON/PReluLayer.cpp
-            NEON/BatchNormalizationLayer.cpp
-            NEON/ElementwiseMin.cpp
-            NEON/InstanceNormalizationLayer.cpp
-            NEON/ROIAlignLayer.cpp
-            NEON/ElementwiseDivision.cpp
-            NEON/ElementwiseExpLayer.cpp
-            NEON/ArgMinMax.cpp
-            NEON/QLSTMLayerNormalization.cpp
-            NEON/Col2Im.cpp
-            NEON/Split.cpp
-            NEON/Transpose.cpp
-            NEON/GenerateProposalsLayer.cpp
-            NEON/StackLayer.cpp
-            NEON/WidthConcatenateLayer.cpp
-            NEON/NormalizationLayer.cpp
-            NEON/Copy.cpp
-            NEON/ElementwiseSquareDiff.cpp
-            NEON/MaxUnpoolingLayer.cpp
-            NEON/Permute.cpp
-            NEON/Comparisons.cpp
-            NEON/BatchConcatenateLayer.cpp
-            NEON/Tile.cpp
-            NEON/BatchToSpaceLayer.cpp
-            NEON/SpaceToDepthLayer.cpp
-            NEON/DepthwiseConvolutionLayerNative.cpp
-            NEON/QuantizationLayer.cpp
-            NEON/ConvertFullyConnectedWeights.cpp
-            NEON/Floor.cpp
-            NEON/FFT.cpp
-            NEON/Cast.cpp
-            NEON/PriorBoxLayer.cpp
-            NEON/Scale.cpp
-            NEON/ReorgLayer.cpp
-            NEON/Range.cpp
-            NEON/DirectConvolutionLayer.cpp
-            NEON/DepthwiseConvolutionLayer.cpp
-            NEON/FullyConnectedLayer.cpp
-            NEON/ConvolutionLayer.cpp
-            NEON/StridedSlice.cpp
-            NEON/ReorderLayer.cpp
-            NEON/UNIT/DynamicTensor.cpp
-            NEON/UNIT/TensorAllocator.cpp
-            NEON/UNIT/MemoryManager.cpp
-            NEON/UNIT/RuntimeContext.cpp
-            runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp
-            runtime/experimental/operators/CpuActivation.cpp
-            runtime/experimental/operators/CpuAdd.cpp
-            runtime/experimental/operators/CpuDepthwiseConv2d.cpp
-            runtime/experimental/operators/CpuElementwise.cpp
-            runtime/experimental/operators/CpuGemm.cpp
-            runtime/experimental/operators/CpuGemmConv2d.cpp
-            runtime/experimental/operators/CpuGemmDirectConv2d.cpp
-            runtime/experimental/operators/CpuMul.cpp
-            runtime/experimental/operators/CpuSoftmax.cpp
-            runtime/experimental/operators/CpuSub.cpp
-            runtime/experimental/operators/CpuTranspose.cpp
-            runtime/experimental/operators/CpuWinogradConv2d.cpp
-            )
+    PRIVATE ${files_validation_neon}
+            ${files_validation_runtime}
+  )
 endif()
diff --git a/tests/validation/CPP/LUT.cpp b/tests/validation/CPP/LUT.cpp
index 1874823d8d..ab005e3ed6 100644
--- a/tests/validation/CPP/LUT.cpp
+++ b/tests/validation/CPP/LUT.cpp
@@ -26,6 +26,7 @@
 #include "tests/validation/Validation.h"
 #include "src/core/helpers/LUTManager.h"
 #include "include/half/half.hpp"
+#include "tests/validation/Helpers.h"
 
 namespace arm_compute
 {
@@ -85,7 +86,7 @@ TEST_SUITE(BF16)
 TEST_CASE(LUTValueTest, framework::DatasetMode::ALL)
 {
     // Define values for test
-    constexpr float beta = 1.0f;
+    constexpr float beta = -1.0f;
     constexpr float rel_tolerance = 0.01f;
     constexpr int num_elements = 65536;
     unsigned int num_mismatches = 0;
@@ -97,14 +98,14 @@ TEST_CASE(LUTValueTest, framework::DatasetMode::ALL)
     if(CPUInfo::get().has_fp16())
     {
         // Retrieve lut, Assert lut exists and is retrieved successfully.
-        std::shared_ptr<LookupTable65536> lut = lman.get_lut_table(info);
+        std::shared_ptr<LookupTable65536> lut = lman.get_lut_table<LookupTable65536>(info);
         ARM_COMPUTE_EXPECT(lut != nullptr, framework::LogLevel::ALL);
 
         // Check each value in lut
         for(int i=0; i < num_elements; i++)
         {
             // Calculate reference in fp32. Convert lut value to fp32.
-            const float fref = std::exp(bf16_to_float(i) * beta * -1);
+            const float fref = std::exp(bf16_to_float(i) * beta);
             const uint16_t target_bf16 = read_as_bf16((*lut)[i]);
             const float target = bf16_to_float(target_bf16);
 
@@ -133,11 +134,19 @@ TEST_CASE(LUTValueTest, framework::DatasetMode::ALL)
 
 TEST_CASE(CheckLutReuse, framework::DatasetMode::ALL)
 {
-    LUTInfo info = {LUTType::Exponential, 1.0f, DataType::BFLOAT16, UniformQuantizationInfo()};
-    LUTManager lman = LUTManager::get_instance();
-    auto first = lman.get_lut_table(info);
-    auto second = lman.get_lut_table(info);
-    ARM_COMPUTE_EXPECT(first == second, framework::LogLevel::ERRORS);
+    if (cpu_supports_dtypes({DataType::BFLOAT16})) // run the check only when BFLOAT16 is reported as supported; otherwise log a skip below
+    {
+        LUTInfo info = {LUTType::Exponential, -1.0f, DataType::BFLOAT16, UniformQuantizationInfo()};
+        LUTManager lman = LUTManager::get_instance();
+        auto first = lman.get_lut_table<LookupTable65536>(info);
+        auto second = lman.get_lut_table<LookupTable65536>(info);
+        ARM_COMPUTE_EXPECT(first == second, framework::LogLevel::ERRORS); // two lookups with the same LUTInfo must compare equal
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("Device does not support BFLOAT16 vector operations. Test SKIPPED.");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
 
diff --git a/tests/validation/NEON/Cast.cpp b/tests/validation/NEON/Cast.cpp
index 668c60545b..7a4f767175 100644
--- a/tests/validation/NEON/Cast.cpp
+++ b/tests/validation/NEON/Cast.cpp
@@ -35,14 +35,22 @@
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/CastFixture.h"
+
+#include <cstdint>
+#include <vector>
+
 namespace arm_compute
 {
 namespace test
 {
 namespace validation
 {
+
+using framework::dataset::make;
+
 namespace
 {
 // Tolerance
@@ -56,60 +64,107 @@ constexpr AbsoluteTolerance<float> zero_tolerance(0);
 /** Input data sets **/
 
 // QASYMM8_SIGNED
-const auto CastQASYMM8_SIGNEDtoS16Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::S16));
-const auto CastQASYMM8_SIGNEDtoS32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::S32));
-const auto CastQASYMM8_SIGNEDtoF32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::F32));
-const auto CastQASYMM8_SIGNEDtoF16Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::F16));
+const auto CastQASYMM8_SIGNEDtoS16Dataset = combine(make("DataType", DataType::QASYMM8_SIGNED), make("DataType", DataType::S16));
+const auto CastQASYMM8_SIGNEDtoS32Dataset = combine(make("DataType", DataType::QASYMM8_SIGNED), make("DataType", DataType::S32));
+const auto CastQASYMM8_SIGNEDtoF32Dataset = combine(make("DataType", DataType::QASYMM8_SIGNED), make("DataType", DataType::F32));
+const auto CastQASYMM8_SIGNEDtoF16Dataset = combine(make("DataType", DataType::QASYMM8_SIGNED), make("DataType", DataType::F16));
 
 // QASYMM8
-const auto CastQASYMM8toF16Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::F16));
-const auto CastQASYMM8toF32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::F32));
-const auto CastQASYMM8toS32Dataset = combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::S32));
+const auto CastQASYMM8toF16Dataset = combine(make("DataType", DataType::QASYMM8), make("DataType", DataType::F16));
+const auto CastQASYMM8toF32Dataset = combine(make("DataType", DataType::QASYMM8), make("DataType", DataType::F32));
+const auto CastQASYMM8toS32Dataset = combine(make("DataType", DataType::QASYMM8), make("DataType", DataType::S32));
 
 // U8
-const auto CastU8toU16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U16));
-const auto CastU8toS16Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S16));
-const auto CastU8toS32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::S32));
-const auto CastU8toF32Dataset = combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::F32));
+const auto CastU8toU16Dataset = combine(make("DataType", DataType::U8), make("DataType", DataType::U16));
+const auto CastU8toS16Dataset = combine(make("DataType", DataType::U8), make("DataType", DataType::S16));
+const auto CastU8toS32Dataset = combine(make("DataType", DataType::U8), make("DataType", DataType::S32));
+const auto CastU8toF32Dataset = combine(make("DataType", DataType::U8), make("DataType", DataType::F32));
 
 // U16
-const auto CastU16toU8Dataset  = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U8));
-const auto CastU16toU32Dataset = combine(framework::dataset::make("DataType", DataType::U16), framework::dataset::make("DataType", DataType::U32));
+const auto CastU16toU8Dataset  = combine(make("DataType", DataType::U16), make("DataType", DataType::U8));
+const auto CastU16toU32Dataset = combine(make("DataType", DataType::U16), make("DataType", DataType::U32));
 
 // S16
-const auto CastS16toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
-const auto CastS16toU8Dataset             = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::U8));
-const auto CastS16toS32Dataset            = combine(framework::dataset::make("DataType", DataType::S16), framework::dataset::make("DataType", DataType::S32));
+const auto CastS16toQASYMM8_SIGNEDDataset = combine(make("DataType", DataType::S16), make("DataType", DataType::QASYMM8_SIGNED));
+const auto CastS16toU8Dataset             = combine(make("DataType", DataType::S16), make("DataType", DataType::U8));
+const auto CastS16toS32Dataset            = combine(make("DataType", DataType::S16), make("DataType", DataType::S32));
 
 //S32
-const auto CastS32toF16Dataset            = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F16));
-const auto CastS32toU8Dataset             = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::U8));
-const auto CastS32toF32Dataset            = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::F32));
-const auto CastS32toQASYMM8Dataset        = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::QASYMM8));
-const auto CastS32toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::S32), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
+const auto CastS32toF16Dataset            = combine(make("DataType", DataType::S32), make("DataType", DataType::F16));
+const auto CastS32toU8Dataset             = combine(make("DataType", DataType::S32), make("DataType", DataType::U8));
+const auto CastS32toF32Dataset            = combine(make("DataType", DataType::S32), make("DataType", DataType::F32));
+const auto CastS32toQASYMM8Dataset        = combine(make("DataType", DataType::S32), make("DataType", DataType::QASYMM8));
+const auto CastS32toQASYMM8_SIGNEDDataset = combine(make("DataType", DataType::S32), make("DataType", DataType::QASYMM8_SIGNED));
 
 // F16
-const auto CastF16toF32Dataset            = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F32));
-const auto CastF16toS32Dataset            = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::S32));
-const auto CastF16toQASYMM8Dataset        = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::QASYMM8));
-const auto CastF16toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
+const auto CastF16toF32Dataset            = combine(make("DataType", DataType::F16), make("DataType", DataType::F32));
+const auto CastF16toS32Dataset            = combine(make("DataType", DataType::F16), make("DataType", DataType::S32));
+const auto CastF16toQASYMM8Dataset        = combine(make("DataType", DataType::F16), make("DataType", DataType::QASYMM8));
+const auto CastF16toQASYMM8_SIGNEDDataset = combine(make("DataType", DataType::F16), make("DataType", DataType::QASYMM8_SIGNED));
 
 // F32
-const auto CastF32toU8Dataset             = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::U8));
-const auto CastF32toF16Dataset            = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::F16));
-const auto CastF32toS32Dataset            = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::S32));
-const auto CastF32toQASYMM8Dataset        = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8));
-const auto CastF32toQASYMM8_SIGNEDDataset = combine(framework::dataset::make("DataType", DataType::F32), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
+const auto CastF32toU8Dataset             = combine(make("DataType", DataType::F32), make("DataType", DataType::U8));
+const auto CastF32toF16Dataset            = combine(make("DataType", DataType::F32), make("DataType", DataType::F16));
+const auto CastF32toS32Dataset            = combine(make("DataType", DataType::F32), make("DataType", DataType::S32));
+const auto CastF32toQASYMM8Dataset        = combine(make("DataType", DataType::F32), make("DataType", DataType::QASYMM8));
+const auto CastF32toQASYMM8_SIGNEDDataset = combine(make("DataType", DataType::F32), make("DataType", DataType::QASYMM8_SIGNED));
 
 // U64
-const auto CastU64toF32Dataset = combine(framework::dataset::make("DataType", DataType::U64), framework::dataset::make("DataType", DataType::F32));
+const auto CastU64toF32Dataset = combine(make("DataType", DataType::U64), make("DataType", DataType::F32));
 
 // S64
-const auto CastS64toF32Dataset = combine(framework::dataset::make("DataType", DataType::S64), framework::dataset::make("DataType", DataType::F32));
+const auto CastS64toF32Dataset = combine(make("DataType", DataType::S64), make("DataType", DataType::F32));
+
+template<typename T>
+void validate_static_cast(const TensorShape &shape, DataType src_dtype, DataType dst_dtype)
+{
+    Tensor input = create_tensor<Tensor>(shape, src_dtype, 1);
+    Tensor output = create_tensor<Tensor>(shape, dst_dtype, 1);
+    // Run NECast with the saturating policy on a constant input and expect every output element to be 1.
+    NECast cast;
+    cast.configure(&input, &output, ConvertPolicy::SATURATE);
+    input.allocator()->allocate();
+    output.allocator()->allocate();
+    // 1.99f separates the two behaviours: truncation gives 1, rounding to nearest would give 2.
+    library->fill_tensor_value(Accessor(input), 1.99f);
+    cast.run();
+    // Only the first shape.x() elements are checked; T is the element type used to read the output buffer.
+    for(unsigned int i = 0; i < shape.x(); ++i)
+    {
+        const T ref = 1;
+        const T target = reinterpret_cast<T*>(output.buffer())[i];
+
+        ARM_COMPUTE_EXPECT(ref == target, framework::LogLevel::ERRORS);
+    }
+}
+
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(Cast)
+
+// Validate casting truncates floats to integer instead of rounding (validate_static_cast feeds 1.99f and expects 1)
+DATA_TEST_CASE(ValidateStaticCastBehavior, framework::DatasetMode::ALL,
+    combine(
+        make("InputDataType", {DataType::F32, DataType::F16}),
+        make("OutputDataType", {DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8})),
+        src_dtype, dst_dtype)
+{
+    const auto shape = TensorShape(18U); // > 16 for channel dim. to stress vector and leftover loops
+    // F32 inputs always run; F16 inputs run only when cpu_supports_dtypes reports F16, otherwise the case does nothing.
+    if(src_dtype == DataType::F32 || (src_dtype == DataType::F16 && cpu_supports_dtypes({DataType::F16})))
+    {
+        if(dst_dtype == DataType::QASYMM8_SIGNED)
+        {
+            validate_static_cast<int8_t>(shape, src_dtype, dst_dtype);
+        }
+        else
+        {
+            validate_static_cast<uint8_t>(shape, src_dtype, dst_dtype);
+        }
+    }
+}
+
 template <typename T>
 using NECastToU8Fixture = CastValidationFixture<Tensor, Accessor, NECast, T, uint8_t>;
 template <typename T>
@@ -206,7 +261,7 @@ CAST_SUITE(F32_to_QASYMM8, DataType::F32, DataType::QASYMM8, NECastToQASYMM8Fixt
 CAST_SUITE(F32_to_F16, DataType::F32, DataType::F16, NECastToF16Fixture<float>, CastF32toF16Dataset, zero_tolerance)
 #endif //  ARM_COMPUTE_ENABLE_FP16
 CAST_SUITE(F32_to_S32, DataType::F32, DataType::S32, NECastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
-CAST_SUITE(F32_to_U8, DataType::F32, DataType::S32, NECastToS32Fixture<float>, CastF32toS32Dataset, one_tolerance)
+CAST_SUITE(F32_to_U8, DataType::F32, DataType::U8, NECastToU8Fixture<float>, CastF32toU8Dataset, one_tolerance)
 
 #ifdef __aarch64__
 // S64
@@ -217,8 +272,8 @@ CAST_SUITE(U64_to_F32, DataType::U64, DataType::F32, NECastToF32Fixture<uint64_t
 #endif // __aarch64__
 
 DATA_TEST_CASE(KernelSelectionDstFP16, framework::DatasetMode::ALL,
-               combine(framework::dataset::make("CpuExt", std::string("NEON")),
-                       framework::dataset::make("DataType",
+               combine(make("CpuExt", std::string("NEON")),
+                       make("DataType",
 {
     DataType::F16,
     DataType::U8,
@@ -245,8 +300,8 @@ cpu_ext, data_type)
 }
 
 DATA_TEST_CASE(KernelSelectionSrcFP32, framework::DatasetMode::ALL,
-               combine(framework::dataset::make("CpuExt", std::string("NEON")),
-                       framework::dataset::make("DataType",
+               combine(make("CpuExt", std::string("NEON")),
+                       make("DataType",
 {
     DataType::F16,
 })),
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 01a16ebccb..e3e918b4cd 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -56,6 +56,13 @@ namespace
 } // namespace
 
 
+const auto QuantizedActivationFunctionsDataset = make("ActivationInfo",
+{
+    ActivationLayerInfo(),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+});
+
 TEST_SUITE(NEON)
 TEST_SUITE(GEMMLowp)
 TEST_SUITE(MatrixMultiplyCore)
@@ -66,8 +73,6 @@ using NEGEMMLowpBatchedMatMulFixture      = GEMMLowpMatrixMultiplyCoreValidation
 using NEGEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture = GEMMLowpMatrixMultiplyCoreDynamicQuantizationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
 using NEGEMMLowpDequantizedMatrixMultiplyValidationFixture = GEMMLowpDequantizedMatrixMultiplyValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
 
-using framework::dataset::make;
-
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()),
                shape_a, shape_b, shape_c, a_offset, b_offset)
 {
@@ -368,7 +373,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreForUpdatedStaticQua
     combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
         make("DataType", { DataType::QASYMM8_SIGNED }),
         make("reshape_b_only_on_first_run", { false }),
-        make("updated_sq_info_after_config", { true })))
+        make("updated_sq_info_after_config", { true }),
+        QuantizedActivationFunctionsDataset
+        ))
 {
     validate(Accessor(_target), _reference, tolerance_batched);
 }
@@ -376,7 +383,9 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreForUpdatedStaticQua
     combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
         make("DataType", { DataType::QASYMM8_SIGNED }),
         make("reshape_b_only_on_first_run", { false }),
-        make("updated_sq_info_after_config", { true })))
+        make("updated_sq_info_after_config", { true }),
+        QuantizedActivationFunctionsDataset
+        ))
 {
     validate(Accessor(_target), _reference, tolerance_batched);
 }
@@ -389,7 +398,9 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreForUpdatedStaticQua
     combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
         make("DataType", { DataType::QASYMM8 }),
         make("reshape_b_only_on_first_run", { false }),
-        make("updated_sq_info_after_config", { true })))
+        make("updated_sq_info_after_config", { true }),
+        QuantizedActivationFunctionsDataset
+        ))
 {
     validate(Accessor(_target), _reference, tolerance_batched);
 }
@@ -397,7 +408,9 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreForUpdatedStaticQua
     combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
         make("DataType", { DataType::QASYMM8 }),
         make("reshape_b_only_on_first_run", { false }),
-        make("updated_sq_info_after_config", { true })))
+        make("updated_sq_info_after_config", { true }),
+        QuantizedActivationFunctionsDataset
+        ))
 {
     validate(Accessor(_target), _reference, tolerance_batched);
 }
diff --git a/tests/validation/NEON/MatMul.cpp b/tests/validation/NEON/MatMul.cpp
index ef79faba51..b75b94e32f 100644
--- a/tests/validation/NEON/MatMul.cpp
+++ b/tests/validation/NEON/MatMul.cpp
@@ -55,6 +55,7 @@ constexpr AbsoluteTolerance<int32_t> tolerance_qasymm8_signed(1);
 // clang-format off
 // *INDENT-OFF*
 // Validation Tests
+#ifdef __aarch64__
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL,
     zip(
         make("InputAInfo", {
@@ -108,6 +109,61 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL,
                                         CpuMatMulSettings());
     ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
 }
+#else // __aarch64__
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL,
+    zip(
+        make("InputAInfo", {
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),        // Mismatching datatype
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::S32),        // Unsupported datatypes
+            TensorInfo(TensorShape(9U, 6U, 2U), 1, DataType::F32),    // Broadcasting in batch dimension not supported
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),        // Invalid shape for multiplication
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32),
+            TensorInfo(TensorShape(9U, 6U , 12U) , 1 , DataType::F32), // Tensors are not dynamic
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED),
+            TensorInfo(TensorShape(9U, 6U), 1, DataType::QASYMM8_SIGNED), // Mismatching data type
+        }),
+        make("InputBInfo", {
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::S32),
+            TensorInfo(TensorShape(5U, 9U, 1U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U, 12U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8), // MatMul of quantized data types is not supported on armv7a
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED),
+            TensorInfo(TensorShape(5U, 9U), 1, DataType::QASYMM8_SIGNED),
+        }),
+        make("OutputInfo", {
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::S32),
+            TensorInfo(TensorShape(5U, 6U, 2U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U, 12U) , 1, DataType::F32),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8_SIGNED),
+            TensorInfo(TensorShape(5U, 6U), 1, DataType::QASYMM8),
+        }),
+        make("TensorIsConst", {false, false, false, false, false , false, true, false, false, false}),
+        make("Expected", { false, false, false, false, true, true, false, false, false, false })),
+    a_info, b_info, output_info, are_tensors_const, expected)
+{
+    TensorInfo a{a_info};
+    TensorInfo b{b_info};
+    a.set_are_values_constant(are_tensors_const); // the same constness flag is applied to both inputs
+    b.set_are_values_constant(are_tensors_const);
+    Status status =  NEMatMul::validate(&a,
+                                        &b,
+                                        &output_info,
+                                        MatMulInfo(),
+                                        CpuMatMulSettings());
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+#endif // __aarch64__
 // *INDENT-ON*
 // clang-format on
 
diff --git a/tests/validation/NEON/Permute.cpp b/tests/validation/NEON/Permute.cpp
index e9939105cd..5c51c7c032 100644
--- a/tests/validation/NEON/Permute.cpp
+++ b/tests/validation/NEON/Permute.cpp
@@ -31,6 +31,7 @@
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/PermuteFixture.h"
 
@@ -179,14 +180,30 @@ TEST_SUITE(F16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEPermuteFixture<float16_t>, framework::DatasetMode::PRECOMMIT,
                        PermuteParametersSmall * framework::dataset::make("DataType", DataType::F16))
 {
-    // Validate output
-    validate(Accessor(_target), _reference);
+    if (cpu_supports_dtypes({DataType::F16}))
+    {
+        // Validate output
+        validate(Accessor(_target), _reference);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. Test SKIPPED.");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NEPermuteFixture<float16_t>, framework::DatasetMode::NIGHTLY,
                        PermuteParametersLarge * framework::dataset::make("DataType", DataType::F16))
 {
-    // Validate output
-    validate(Accessor(_target), _reference);
+    if (cpu_supports_dtypes({DataType::F16}))
+    {
+        // Validate output
+        validate(Accessor(_target), _reference);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. Test SKIPPED.");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 TEST_SUITE_END()
 #endif /* ARM_COMPUTE_ENABLE_FP16 */
diff --git a/tests/validation/NEON/QuantizationLayer.cpp b/tests/validation/NEON/QuantizationLayer.cpp
index fac5d73abd..da057c4c1f 100644
--- a/tests/validation/NEON/QuantizationLayer.cpp
+++ b/tests/validation/NEON/QuantizationLayer.cpp
@@ -34,6 +34,7 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/QuantizationLayerFixture.h"
 
+#include <vector>
 
 namespace arm_compute
 {
@@ -44,8 +45,11 @@ namespace validation
 namespace
 {
 /** Tolerance for quantization */
+/// @note: We do not expect any difference between our reference and target implementations for UInt8 and Int8
 constexpr AbsoluteTolerance<uint8_t>  tolerance_u8(1);  /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8 data types */
 constexpr AbsoluteTolerance<int8_t>   tolerance_s8(1);  /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8_SIGNED data types */
+constexpr AbsoluteTolerance<int8_t>   zero_tolerance_s8(0);
+
 constexpr AbsoluteTolerance<uint16_t> tolerance_u16(1); /**< Tolerance value for comparing reference's output against implementation's output for QASYMM16 data types */
 const auto                            QuantizationSmallShapes = concat(datasets::Small3DShapes(), datasets::Small4DShapes());
 const auto                            QuantizationLargeShapes = concat(datasets::Large3DShapes(), datasets::Large4DShapes());
@@ -54,6 +58,38 @@ const auto                            QuantizationLargeShapes = concat(datasets:
 TEST_SUITE(NEON)
 TEST_SUITE(QuantizationLayer)
 
+TEST_CASE(ProperlyRoundedRequantization, framework::DatasetMode::ALL)
+{
+    // The test case here covers both Int8 and UInt8 because the underlying kernel is the same
+    const auto shape = TensorShape(18U); // > 16 for channel dim. to stress vector and leftover loops
+    const auto dtype = DataType::QASYMM8_SIGNED;
+    const auto in_qinfo = QuantizationInfo(0.5f, -1); // input: scale 0.5, offset -1
+    const auto out_qinfo = QuantizationInfo(1.f, -1); // output: scale 1, offset -1
+
+    Tensor input = create_tensor<Tensor>(shape, dtype, 1, in_qinfo);
+    Tensor output = create_tensor<Tensor>(shape, dtype, 1, out_qinfo);
+
+    NEQuantizationLayer quant_layer;
+    quant_layer.configure(&input, &output);
+
+    input.allocator()->allocate();
+    output.allocator()->allocate();
+
+    std::vector<int8_t> values =   {1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35};
+    std::vector<int8_t> expected = {0,1,2,3,4,5 ,6 ,7 ,8 ,9 ,10,11,12,13,14,15,16,17}; // (x + 1)/2 - 1
+
+    SimpleTensor<int8_t> ref {shape, dtype, 1, out_qinfo};
+
+    ARM_COMPUTE_EXPECT(values.size() == shape.x(), framework::LogLevel::ERRORS); // one static value per tensor element
+
+    library->fill_static_values(Accessor(input), values);
+    library->fill_static_values(ref, expected);
+
+    quant_layer.run();
+
+    validate(Accessor(output), ref, zero_tolerance_s8); // exact match required
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp
index e5692693bd..05d09369c2 100644
--- a/tests/validation/NEON/ReduceMean.cpp
+++ b/tests/validation/NEON/ReduceMean.cpp
@@ -34,6 +34,10 @@
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ReduceMeanFixture.h"
 
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
 namespace arm_compute
 {
 namespace test
@@ -54,6 +58,9 @@ constexpr AbsoluteTolerance<uint8_t> tolerance_u8(2);    /**< Tolerance value fo
 constexpr AbsoluteTolerance<int8_t>  tolerance_s8(2);    /**< Tolerance value for comparing reference's output against implementation's output for signed 8-bit asymmetric quantized type */
 #endif // __aarch64__
 
+constexpr AbsoluteTolerance<uint8_t> zero_tolerance_u8(0);
+constexpr AbsoluteTolerance<int8_t>  zero_tolerance_s8(0);
+
 const auto axis_keep = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1, 0), Coordinates(1, 2), Coordinates(0, 2), Coordinates(1, 3), Coordinates(2, 3), Coordinates(0, 1, 2, 3) }),
                                framework::dataset::make("KeepDims", { true }));
 const auto axis_drop = combine(framework::dataset::make("Axis", { Coordinates(0), Coordinates(1), Coordinates(3) }), framework::dataset::make("KeepDims", { false }));
@@ -61,6 +68,87 @@ const auto axis_drop = combine(framework::dataset::make("Axis", { Coordinates(0)
 TEST_SUITE(NEON)
 TEST_SUITE(ReduceMean)
 
+TEST_CASE(ProperRoundingPolicyXReduction, framework::DatasetMode::ALL)
+{
+    // We do not need to stress vector and leftover loops differently
+    // because the rounding is done in scalar code at the end. Accumulation
+    // is done over integer types.
+    constexpr int x_len = 2;
+
+    const auto input_shape = TensorShape(x_len);
+    const auto output_shape = TensorShape(1);
+    const bool keep_dims = true;
+    const auto axis = Coordinates(0);
+    const auto input_qinfo = QuantizationInfo(2 / 255.f, 0);
+    const auto output_qinfo = QuantizationInfo(6 / 255.f, -1);
+    const auto dtype = DataType::QASYMM8_SIGNED;
+
+    Tensor input = create_tensor<Tensor>(input_shape, dtype, 1, input_qinfo);
+    Tensor output = create_tensor<Tensor>(output_shape, dtype, 1, output_qinfo);
+
+    NEReduceMean reduce_mean;
+    reduce_mean.configure(&input, axis, keep_dims, &output);
+
+    input.allocator()->allocate();
+    output.allocator()->allocate();
+
+    std::vector<int8_t> values {50, 26};
+    library->fill_static_values(Accessor(input), values);
+
+    std::vector<int8_t> expected {12}; // (50 + 26) / 2 = 38; 38 * 2 / 6 = 12.67 rounds to 13; 13 + (-1) = 12
+    SimpleTensor<int8_t> ref{ output_shape, dtype, 1, output_qinfo }; // expected values live in the output's quantization space
+    library->fill_static_values(ref, expected);
+
+    reduce_mean.run();
+
+    // The tolerance should be 0 because this test stresses the rounding behavior of the operator
+    validate(Accessor(output), ref, zero_tolerance_s8);
+}
+
+#ifdef __aarch64__
+// Due to the lack of instructions in a32, the rounding operation is less
+// accurate
+TEST_CASE(ProperRoundingPolicyNonXReduction, framework::DatasetMode::ALL)
+{
+    constexpr int x_len = 17; // > 16 to stress both vector and leftover loops
+
+    const auto input_shape = TensorShape(x_len, 2, 2, 1);
+    const auto output_shape = TensorShape(x_len, 1, 1, 1);
+    const bool keep_dims = true;
+    const auto axis = Coordinates(1, 2);
+    const auto input_qinfo = QuantizationInfo(2 / 255.f, 127);
+    const auto output_qinfo = QuantizationInfo(2 / 255.f, 127);
+    const auto dtype = DataType::QASYMM8;
+
+    Tensor input = create_tensor<Tensor>(input_shape, dtype, 1, input_qinfo);
+    Tensor output = create_tensor<Tensor>(output_shape, dtype, 1, output_qinfo);
+
+    NEReduceMean reduce_mean;
+    reduce_mean.configure(&input, axis, keep_dims, &output);
+
+    input.allocator()->allocate();
+    output.allocator()->allocate();
+
+    // {139, 139 ... 139 (x_len times) 154, 154, ... 154 (x_len_times) ...}
+    std::vector<uint8_t> values;
+    std::fill_n(std::back_inserter(values), x_len, 139);
+    std::fill_n(std::back_inserter(values), x_len, 154);
+    std::fill_n(std::back_inserter(values), x_len, 164);
+    std::fill_n(std::back_inserter(values), x_len, 179);
+    library->fill_static_values(Accessor(input), values);
+
+    std::vector<uint8_t> expected;
+    std::fill_n(std::back_inserter(expected), x_len, 159); // 159 = (139 + 154 + 164 + 179) / 4
+    SimpleTensor<uint8_t> ref{ output_shape, dtype, 1, output_qinfo }; // expected values live in the output's quantization space
+    library->fill_static_values(ref, expected);
+
+    reduce_mean.run();
+
+    // The tolerance should be 0 because this test stresses the rounding behavior of the operator
+    validate(Accessor(output), ref, zero_tolerance_u8);
+}
+#endif // __aarch64__
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index 55de2d6281..82e6ceaa71 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -28,6 +28,7 @@
 #include "tests/framework/Macros.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ScaleFixture.h"
+#include "utils/TypePrinter.h"
 
 namespace arm_compute
 {
@@ -165,9 +166,23 @@ TEST_CASE(SupportDataType, framework::DatasetMode::ALL)
     {
         const auto input  = TensorInfo{ input_shape, 1, kv.first, default_data_layout };
         const auto output = TensorInfo{ output_shape, 1, kv.first, default_data_layout };
-
-        result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
-        ARM_COMPUTE_EXPECT(bool(result) == kv.second, framework::LogLevel::ERRORS);
+        if(cpu_supports_dtypes({kv.first}))
+        {
+            result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
+            ARM_COMPUTE_EXPECT_EQUAL(bool(result) , kv.second, framework::LogLevel::ERRORS);
+            if(bool(result) != kv.second)
+            {
+                std::string fail_reason = "For " + to_string(kv.first) + " validate() returns " + to_string(bool(result)) + " but expected answer is " + to_string(kv.second);
+                ARM_COMPUTE_TEST_INFO(fail_reason);
+                framework::ARM_COMPUTE_PRINT_INFO();
+            }
+        }
+        else
+        {
+            std::string skip_reason = "Skip supported datatype test because device does not support " + to_string(kv.first) + " vector operations.";
+            ARM_COMPUTE_TEST_INFO(skip_reason.c_str());
+            framework::ARM_COMPUTE_PRINT_INFO();
+        }
     }
 }
 
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index e428d7958b..c8c3f0bb49 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -145,7 +145,7 @@ DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL,
     cpu_isa.fp16 = (data_type == DataType::F16);
 
     const auto *selected_impl = CpuSoftmaxKernel::get_implementation(
-        SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */, 0 /* axis */, CPUInfo::get().get_sme2_vector_length()},
+        SoftmaxKernelDataTypeISASelectorData{ data_type, cpu_isa, false /* is_log */, 0 /* axis */, CPUInfo::get().get_sme2_vector_length_in_bits()},
         cpu::KernelSelectionType::Preferred);
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl);
@@ -232,6 +232,29 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::Dataset
 }
 TEST_SUITE_END() //FP16
 #endif           /* ARM_COMPUTE_ENABLE_FP16 */
+#ifdef ARM_COMPUTE_ENABLE_BF16
+constexpr AbsoluteTolerance<float> tolerance_bf16{0.02f};
+TEST_SUITE(BF16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<bfloat16>, framework::DatasetMode::PRECOMMIT,
+    combine(
+        datasets::SmallShapes(),
+        make("DataType", DataType::BFLOAT16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 })))
+{
+    if(CPUInfo::get().has_bf16()) // validate only when the CPU reports bf16 support
+    {
+        // Validate output
+        validate(Accessor(_target), _reference, tolerance_bf16);
+    }
+    else // otherwise only log that the case was skipped
+    {
+        ARM_COMPUTE_TEST_INFO("Device does not support bf16 vector operations. Test SKIPPED.");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+TEST_SUITE_END() //BF16
+#endif /* ARM_COMPUTE_ENABLE_BF16 */
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
diff --git a/tests/validation/NEON/UNIT/TensorAllocator.cpp b/tests/validation/NEON/UNIT/TensorAllocator.cpp
index 0aab9ef9b5..f2863552e2 100644
--- a/tests/validation/NEON/UNIT/TensorAllocator.cpp
+++ b/tests/validation/NEON/UNIT/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -65,8 +65,14 @@ TEST_CASE(ImportMemory, framework::DatasetMode::ALL)
     ARM_COMPUTE_ASSERT(t1.info()->is_resizable());
 
     // Negative case : Import misaligned pointer
-    Tensor       t2;
-    const size_t required_alignment = 339;
+    Tensor t2;
+    size_t required_alignment = 339;
+    ARM_COMPUTE_ASSERT(data.get() != nullptr);
+    // If the data ptr is aligned with 339, keep adding 1 until it is misaligned.
+    while (arm_compute::utility::check_aligned(data.get(), required_alignment))
+    {
+        required_alignment += 1;
+    }
     t2.allocator()->init(info, required_alignment);
     ARM_COMPUTE_ASSERT(!bool(t2.allocator()->import_memory(data.get())));
     ARM_COMPUTE_ASSERT(t2.info()->is_resizable());
diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h
index 289aca4d08..4c07134c35 100644
--- a/tests/validation/Validation.h
+++ b/tests/validation/Validation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_VALIDATION_H
-#define ARM_COMPUTE_TEST_VALIDATION_H
+#ifndef ACL_TESTS_VALIDATION_VALIDATION_H
+#define ACL_TESTS_VALIDATION_VALIDATION_H
 
 #include "arm_compute/core/IArray.h"
 #include "arm_compute/core/Types.h"
@@ -54,6 +54,14 @@ inline bool are_equal_infs(T val0, T val1)
     const auto same_sign = support::cpp11::signbit(val0) == support::cpp11::signbit(val1);
     return (!support::cpp11::isfinite(val0)) && (!support::cpp11::isfinite(val1)) && same_sign;
 }
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+template <> // float16_t specialization
+inline bool are_equal_infs(float16_t val0, float16_t val1)
+{
+    return are_equal_infs(static_cast<float>(val0), static_cast<float>(val1)); // widen to float and delegate to the float comparison
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
 } // namespace
 
 /** Class reprensenting an absolute tolerance value. */
@@ -689,4 +697,4 @@ void validate_min_max_loc(const MinMaxLocationValues<T> &target, const MinMaxLoc
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_REFERENCE_VALIDATION_H */
+#endif // ACL_TESTS_VALIDATION_VALIDATION_H
diff --git a/tests/validation/fixtures/CastFixture.h b/tests/validation/fixtures/CastFixture.h
index 8297ec81dc..432df69b41 100644
--- a/tests/validation/fixtures/CastFixture.h
+++ b/tests/validation/fixtures/CastFixture.h
@@ -65,6 +65,10 @@ class CastValidationFixture : public framework::Fixture
                 case DataType::U8:
                 case DataType::QASYMM8:
                 case DataType::QASYMM8_SIGNED:
+                case DataType::QSYMM8:
+                case DataType::QSYMM8_PER_CHANNEL:
+                case DataType::QSYMM16:
+                case DataType::QASYMM16:
                 case DataType::S8:
                 case DataType::F32:
                 {
@@ -113,9 +117,13 @@ class CastValidationFixture : public framework::Fixture
 
     TensorType compute_target(const TensorShape &shape, DataType dt_in, DataType dt_out, ConvertPolicy policy)
     {
+        // Placeholder quantization info: the values are not used, but they are required to create the tensor buffers for QSYMM8_PER_CHANNEL
+        QuantizationInfo src_not_used_qinfo(0.25f, 2);
+        QuantizationInfo dst_not_used_qinfo(0.5f, 2);
+
         // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, dt_in, 1);
-        TensorType dst = create_tensor<TensorType>(shape, dt_out, 1);
+        TensorType src = create_tensor<TensorType>(shape, dt_in, 1, src_not_used_qinfo);
+        TensorType dst = create_tensor<TensorType>(shape, dt_out, 1, dst_not_used_qinfo);
 
         // Create and configure function
         FunctionType cast;
diff --git a/tests/validation/fixtures/CpuDequantizeFixture.h b/tests/validation/fixtures/CpuDequantizeFixture.h
new file mode 100644
index 0000000000..06352818fc
--- /dev/null
+++ b/tests/validation/fixtures/CpuDequantizeFixture.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_CPUDEQUANTIZEFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_CPUDEQUANTIZEFIXTURE_H
+
+
+#include "tests/validation/fixtures/DequantizationLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class CpuDequantizationValidationFixture : public DequantizationValidationFixture<TensorType,  AccessorType,  FunctionType, T> // variant that runs FunctionType through an ITensorPack
+{
+public:
+    void setup(TensorShape shape, DataType src_data_type, DataType dst_datatype, DataLayout data_layout)
+    {
+        if(!cpu_supports_dtypes({src_data_type, dst_datatype})){ // unsupported data types: skip computing target and reference
+            return;
+        }
+
+        this->_quantization_info = this->generate_quantization_info(src_data_type, shape.z());
+        this->_target            = this->compute_target(shape, src_data_type, dst_datatype, data_layout);
+        this->_reference         = this->compute_reference(shape, src_data_type);
+    }
+
+protected:
+    TensorType compute_target(TensorShape shape, DataType src_data_type, DataType dst_datatype, DataLayout data_layout)
+    {
+        if(data_layout == DataLayout::NHWC)
+        {
+            permute(shape, PermutationVector(2U, 0U, 1U));
+        }
+
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(shape, src_data_type, 1, this->_quantization_info, data_layout);
+        TensorType dst = create_tensor<TensorType>(shape, dst_datatype, 1, QuantizationInfo(), data_layout);
+
+        // Create and configure function
+        FunctionType dequantization_layer;
+        dequantization_layer.configure(src.info(), dst.info()); // configured from tensor infos; the tensors themselves are passed to run() via the pack
+
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Fill tensors
+        this->fill(AccessorType(src));
+
+        // Prepare tensor pack
+        ITensorPack run_pack = { { arm_compute::TensorType::ACL_SRC, &src },
+                                { arm_compute::TensorType::ACL_DST, &dst } };
+
+        // Compute function
+        dequantization_layer.run(run_pack);
+
+        return dst;
+    }
+
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_CPUDEQUANTIZEFIXTURE_H
diff --git a/tests/validation/fixtures/CpuGEMMLowpFixture.h b/tests/validation/fixtures/CpuGEMMLowpFixture.h
new file mode 100644
index 0000000000..91083ea0cf
--- /dev/null
+++ b/tests/validation/fixtures/CpuGEMMLowpFixture.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_CPUGEMMLOWPFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_CPUGEMMLOWPFIXTURE_H
+
+#include "tests/validation/fixtures/GEMMLowpFixture.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+
+namespace {
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d, bool reinterpret_output_as_3d, typename OutputType, bool is_fused = false, bool run_twice = false>
+TensorType compute_cpugemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
+                                   const QuantizationInfo& output_qinfo, DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8,
+                                   GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(),
+                                   bool accumulate = false, bool dynamic_qinfo = false, DataType data_type_output = DataType::UNKNOWN)
+{ // Creates a, b, output (and bias when is_fused), configures FunctionType, runs it (twice when run_twice) and returns output. OutputType is not referenced in this body.
+    ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a));
+    // If the output data type is unknown, derive it: S32 when there is no output stage, otherwise the type of a
+    if (data_type_output == DataType::UNKNOWN) {
+        data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a;
+    }
+
+    // Create tensors (with dynamic_qinfo, a and b start with a placeholder dynamic QuantizationInfo that is replaced after configure)
+    TensorType a      = create_tensor<TensorType>(shape_a, data_type_a, 1, dynamic_qinfo ? QuantizationInfo(1.0,0,true) : a_qinfo);
+    TensorType b      = create_tensor<TensorType>(shape_b, data_type_b, 1, dynamic_qinfo ? QuantizationInfo(1.0,0,true) : b_qinfo); // NOTE: passing data_layout_output here causes a mismatch in the GEMM output before the output stage; to be investigated
+    TensorType output = create_tensor<TensorType>(shape_output, data_type_output, 1, output_qinfo /* output_qinfo will be ignored when output stage type is None */);
+
+    TensorType bias; // only created, allocated and filled when is_fused
+    if(is_fused)
+    {
+        TensorShape bias_shape(shape_b[0]); // 1D bias whose length is the first dimension of b
+        bias = create_tensor<TensorType>(bias_shape,data_type_output == DataType::F32 ? DataType::F32 : DataType::S32, 1);
+    }
+
+    // Create and configure function
+    // The GEMMinfo includes the values of the depth in case of reinterpreted 3d input/output
+    FunctionType gemmlowp;
+    gemmlowp.configure(a.info(), b.info(), is_fused ? bias.info() : nullptr, output.info(), GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false,
+                                                                             output_stage, false /*fp_mixed_precision*/, false /*fast_math*/, false /*broadcast_bias*/,
+                                                                             arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED,
+                                                                             false /* pretranspose_B */, accumulate));
+
+    // If the QuantizationInfo is dynamic, it needs to be settable after configure (note that we also force it to be dynamic)
+    if (dynamic_qinfo)
+    {
+        a.info()->set_quantization_info(QuantizationInfo(a_qinfo.scale(), a_qinfo.offset(), true));
+        b.info()->set_quantization_info(QuantizationInfo(b_qinfo.scale(), b_qinfo.offset(), true));
+    }
+
+    ARM_COMPUTE_ASSERT(a.info()->is_resizable()); // infos must still be resizable before allocation
+    ARM_COMPUTE_ASSERT(b.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(output.info()->is_resizable());
+
+    add_padding_x({ &a, &b, &output });
+
+    // Allocate tensors
+    a.allocator()->allocate();
+    b.allocator()->allocate();
+    output.allocator()->allocate();
+
+    ARM_COMPUTE_ASSERT(!a.info()->is_resizable()); // and must no longer be resizable once allocated
+    ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
+    ARM_COMPUTE_ASSERT(!output.info()->is_resizable());
+
+    // Tensor pack handed to the operator; it refers to the tensors created above in this call
+    ITensorPack pack =
+    {
+        { arm_compute::TensorType::ACL_SRC_0, &a },
+        { arm_compute::TensorType::ACL_SRC_1, &b },
+        { arm_compute::TensorType::ACL_DST, &output }
+    };
+
+    // Fill tensors (every seed is offset by finfo.hash)
+    fill_quantized(AccessorType(a), 0 + finfo.hash);
+    fill_quantized(AccessorType(b), 1 + finfo.hash);
+
+    if (accumulate) // output is pre-filled, presumably as the initial value to accumulate into
+    {
+        ARM_COMPUTE_ASSERT(accumulate != run_twice); // accumulate and run_twice must not both be set
+        fill(AccessorType(output), 6 + finfo.hash, finfo.min_output, finfo.max_output);
+    }
+
+    if(is_fused)
+    {
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+        bias.allocator()->allocate();
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        fill(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias);
+        pack.add_tensor(arm_compute::TensorType::ACL_SRC_2, &bias); // bias joins the pack as the third source
+    }
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(gemmlowp.workspace(), mg, pack, pack); // same pack is passed for both pack arguments
+    allocate_tensors(gemmlowp.workspace(), ws);
+
+    // Run with variable inputs: run once, then refill a, b (and bias) with new seeds before the final run.
+    if(run_twice)
+    {
+        gemmlowp.run(pack);
+        fill_quantized(AccessorType(a), 3 + finfo.hash); // Fill tensors with new seed after run
+        fill_quantized(AccessorType(b), 4 + finfo.hash);
+        if(is_fused)
+        {
+            fill(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias);
+        }
+    }
+
+    // Compute GEMM function
+    gemmlowp.run(pack);
+    return output;
+}
+} // namespace
+
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false>
+class CpuGEMMLowpMatrixMultiplyCoreValidationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>
+{ // Fixture that replaces the base target computation with compute_cpugemmlowp_target while reusing the base compute_reference.
+public:
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
+    {
+        const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset); // fixed scale; only the offset comes from the caller
+        const auto b_qinfo = QuantizationInfo(2.0f / 255, b_offset);
+        TensorFillInfo finfo; // default fill settings
+
+        bool accumulate = false; // this fixture never accumulates into the output
+        bool dynamic_qinfo = false; // quantization info is fixed at tensor creation
+        this->_target    = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo);
+        this->_reference = this->compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate);
+    }
+
+protected:
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo)
+    { // Unfused QASYMM8 x QASYMM8 run with a default GEMMLowpOutputStageInfo and reshape_b_only_on_first_run = false.
+        const auto output_qinfo = QuantizationInfo(); // No output stage
+        return compute_cpugemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, DataType::QASYMM8, DataType::QASYMM8, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif // ACL_TESTS_VALIDATION_FIXTURES_CPUGEMMLOWPFIXTURE_H
diff --git a/tests/validation/fixtures/CpuGemmAssemblyDispatchFixture.h b/tests/validation/fixtures/CpuGemmAssemblyDispatchFixture.h
index fc070eb7a0..5d74e210d5 100644
--- a/tests/validation/fixtures/CpuGemmAssemblyDispatchFixture.h
+++ b/tests/validation/fixtures/CpuGemmAssemblyDispatchFixture.h
@@ -24,11 +24,15 @@
 #ifndef ACL_TESTS_VALIDATION_FIXTURES_CPUGEMMASSEMBLYDISPATCHFIXTURE_H
 #define ACL_TESTS_VALIDATION_FIXTURES_CPUGEMMASSEMBLYDISPATCHFIXTURE_H
 
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
+#include "arm_compute/runtime/NEON/functions/NETranspose.h"
+
 #include "src/core/NEON/kernels/arm_gemm/utils.hpp"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
+#include "tests/validation/reference/ActivationLayer.h"
 #include "tests/validation/reference/GEMM.h"
-#include "arm_compute/core/Helpers.h"
 
 namespace arm_compute
 {
@@ -40,19 +44,27 @@ template <typename TensorType, typename AccessorType, typename FunctionType, typ
 class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixture
 {
 public:
-    void setup(TensorShape shape_a,
-               TensorShape shape_b,
-               TensorShape shape_c,
-               TensorShape output_shape,
-               float       alpha,
-               float       beta,
-               DataType    data_type,
-               bool        accumulate)
+    void setup(TensorShape         shape_a,
+               TensorShape         shape_b,
+               TensorShape         shape_c,
+               TensorShape         output_shape,
+               float               alpha,
+               float               beta,
+               DataType            data_type,
+               bool                accumulate,
+               bool                pretranspose_b,
+               ActivationLayerInfo act_info)
     {
+        if(std::is_same<TensorType, Tensor>::value &&  // Cpu tensors only: F16 needs FP16 support on the running CPU
+            data_type == DataType::F16 && !CPUInfo::get().has_fp16())
+        {
+            return; // Skip: _target and _reference keep their default-constructed values
+        }
         ARM_COMPUTE_UNUSED(alpha);
         ARM_COMPUTE_UNUSED(beta);
-        _target    = compute_target(shape_a, shape_b, shape_c, output_shape, data_type, accumulate);
-        _reference = compute_reference(shape_a, shape_b, output_shape, data_type, accumulate);
+        _target =
+            compute_target(shape_a, shape_b, shape_c, output_shape, data_type, accumulate, pretranspose_b, act_info);
+        _reference = compute_reference(shape_a, shape_b, output_shape, data_type, accumulate, act_info); // reference takes no pretranspose_b
     }
 
 protected:
@@ -78,48 +90,56 @@ class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixtur
         }
     }
 
-    TensorType compute_target(const TensorShape &shape_a,
-                              const TensorShape &shape_b,
-                              const TensorShape &shape_c,
-                              const TensorShape &output_shape,
-                              DataType           data_type,
-                              bool               accumulate)
+    TensorType compute_target(const TensorShape  &shape_a,
+                              const TensorShape  &shape_b,
+                              const TensorShape  &shape_c,
+                              const TensorShape  &output_shape,
+                              DataType            data_type,
+                              bool                accumulate,
+                              bool                pretranspose_b,
+                              ActivationLayerInfo act_info)
     {
         ARM_COMPUTE_UNUSED(shape_c);
         // Create tensors
-        TensorType  a   = create_tensor<TensorType>(shape_a, data_type, 1);
-        TensorType  b   = create_tensor<TensorType>(shape_b, data_type, 1);
-        TensorType *c   = nullptr;
-        TensorType  dst = create_tensor<TensorType>(output_shape, data_type, 1);
+        TensorType  a            = create_tensor<TensorType>(shape_a, data_type, 1);
+        TensorType  b            = create_tensor<TensorType>(shape_b, data_type, 1);
+        TensorType  b_transposed = create_tensor<TensorType>({shape_b[1], shape_b[0]}, data_type, 1);
+        TensorType *c            = nullptr;
+        TensorType  dst          = create_tensor<TensorType>(output_shape, data_type, 1);
 
         // Create and configure function
         FunctionType gemm;
+        NETranspose  transpose;
 
-        add_padding_x({&a, &b, &dst});
+        add_padding_x({&a, &b, &b_transposed, &dst});
 
         GEMMInfo gemm_info;
         gemm_info.set_accumulate(accumulate);
+        gemm_info.set_pretranspose_B(pretranspose_b);
+        gemm_info.set_activation_info(act_info);
+
+        TensorType &b_to_use = pretranspose_b ? b_transposed : b;
 
-        ARM_COMPUTE_ASSERT(gemm.validate(a.info(), b.info(), nullptr, dst.info(), gemm_info));
+        ARM_COMPUTE_ASSERT(gemm.validate(a.info(), b_to_use.info(), nullptr, dst.info(), gemm_info));
 
-        // The GEMMinfo includes the values of the depth in case of reinterpreted 3d output.
-        // If the output shape has the same number of dimensions of the input the method called is a 2D matrix multiplication (depth_output_reinterpreted_as_3D = 0),
-        // in the other case we have to use the reinterpreted version of GEMM (depth_output_reinterpreted_as_3D = depth of the 3D output).
-        gemm.configure(a.info(), b.info(), nullptr, dst.info(), gemm_info);
+        gemm.configure(a.info(), b_to_use.info(), nullptr, dst.info(), gemm_info);
 
         ARM_COMPUTE_ASSERT(gemm.is_configured());
 
         ARM_COMPUTE_ASSERT(a.info()->is_resizable());
         ARM_COMPUTE_ASSERT(b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(b_transposed.info()->is_resizable());
         ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
 
         // Allocate tensors
         a.allocator()->allocate();
         b.allocator()->allocate();
+        b_transposed.allocator()->allocate();
         dst.allocator()->allocate();
 
         ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
         ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!b_transposed.info()->is_resizable());
         ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
 
         // Fill tensors
@@ -130,13 +150,19 @@ class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixtur
             fill(AccessorType(dst), 6);
         };
 
+        if (pretranspose_b)
+        {
+            transpose.configure(&b, &b_transposed);
+            transpose.run();
+        }
+
         ITensorPack run_pack{{arm_compute::TensorType::ACL_SRC_0, &a},
-                             {arm_compute::TensorType::ACL_SRC_1, &b},
+                             {arm_compute::TensorType::ACL_SRC_1, &b_to_use},
                              {arm_compute::TensorType::ACL_SRC_2, c},
                              {arm_compute::TensorType::ACL_DST_0, &dst}};
 
         // Prepare memory
-        ITensorPack prep_pack{{arm_compute::TensorType::ACL_SRC_1, &b}, {arm_compute::TensorType::ACL_SRC_2, c}};
+        ITensorPack prep_pack{{arm_compute::TensorType::ACL_SRC_1, &b_to_use}, {arm_compute::TensorType::ACL_SRC_2, c}};
 
         experimental::MemoryRequirements aux_mem_req = gemm.workspace();
         MemoryGroup                      memory_group{};
@@ -157,7 +183,7 @@ class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixtur
         }
         else
         {
-            run_pack.add_const_tensor(ACL_SRC_1, &b);
+            run_pack.add_const_tensor(ACL_SRC_1, &b_to_use);
         }
 
         // Release temporary tensors that are only used in prepare stage
@@ -169,15 +195,17 @@ class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixtur
 
         a.allocator()->free();
         b.allocator()->free();
+        b_transposed.allocator()->free();
 
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape_a,
-                                      const TensorShape &shape_b,
-                                      const TensorShape &output_shape,
-                                      DataType           data_type,
-                                      bool               accumulate)
+    SimpleTensor<T> compute_reference(const TensorShape  &shape_a,
+                                      const TensorShape  &shape_b,
+                                      const TensorShape  &output_shape,
+                                      DataType            data_type,
+                                      bool                accumulate,
+                                      ActivationLayerInfo act_info)
     {
         // Create reference
         SimpleTensor<T> a{shape_a, data_type, 1};
@@ -196,28 +224,52 @@ class CpuGemmAssemblyDispatchGenericValidationFixture : public framework::Fixtur
             fill(dst, 6);
         }
 
-        // Setting beta to 0 will effectively disable C for the
-        // computation of the reference: A * B + 0 * C
-        // Use transposed tensors if boolean enabled else use original tensors
         if (accumulate)
         {
             reference::gemm_accumulate<T>(a, b, c, 1.0f, 0.f, dst);
-            return dst;
         }
         else
         {
-            return reference::gemm<T>(a, b, c, 1.f, 0.f);
+            dst = reference::gemm<T>(a, b, c, 1.f, 0.f);
         }
+
+        if (act_info.enabled())
+        {
+            return reference::activation_layer<T>(dst, act_info);
+        }
+        return dst;
     }
 
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool accumulate>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
 class CpuGemmAssemblyDispatchValidationFixture
     : protected CpuGemmAssemblyDispatchGenericValidationFixture<TensorType, AccessorType, FunctionType, T>
 {
+public:
+    void setup(TensorShape         shape_a,
+               TensorShape         shape_b,
+               TensorShape         shape_c,
+               TensorShape         output_shape,
+               float               alpha,
+               float               beta,
+               DataType            data_type,
+               bool                accumulate,
+               bool                pretranspose_b,
+               ActivationLayerInfo act_info)
+    { // Thin wrapper: forwards every argument unchanged to the generic fixture's setup.
+        CpuGemmAssemblyDispatchGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(
+            shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type, accumulate, pretranspose_b, act_info);
+    }
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class CpuGemmAssemblyDispatchFixedFormatFixture
+    : protected CpuGemmAssemblyDispatchGenericValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
 public:
     void setup(TensorShape shape_a,
                TensorShape shape_b,
@@ -227,11 +279,159 @@ class CpuGemmAssemblyDispatchValidationFixture
                float       beta,
                DataType    data_type)
     {
-        CpuGemmAssemblyDispatchGenericValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(
-            shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type, accumulate);
+        ARM_COMPUTE_UNUSED(alpha);
+        ARM_COMPUTE_UNUSED(beta);
+        this->_target = compute_target(shape_a, shape_b, shape_c, output_shape, data_type);
+        this->_reference =
+            this->compute_reference(shape_a, shape_b, output_shape, data_type, false, ActivationLayerInfo());
     }
+
+protected:
+    inline TensorInfo prepare_weights(const TensorInfo tensor_info, const arm_compute::WeightFormat weight_format)
+    { // Returns a copy of tensor_info re-initialised with the shape, strides and total size computed below for weight_format.
+        const DataLayout  data_layout  = tensor_info.data_layout();
+        const DataType    data_type    = tensor_info.data_type();
+        const TensorShape tensor_shape = tensor_info.tensor_shape();
+        const int N = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)]; // N=O
+        const int H = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)];
+        const int W = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)];
+        const int C = tensor_shape[get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)]; // C=I
+
+        const int interleave_by = arm_compute::interleave_by(weight_format);
+        const int block_by      = arm_compute::block_by(weight_format);
+        const int Ip            = arm_gemm::roundup<unsigned int>(C, block_by);      // C'=I' (C rounded up using block_by)
+        const int Op            = arm_gemm::roundup<unsigned int>(N, interleave_by); // O'=N' (N rounded up using interleave_by)
+
+        arm_compute::Strides strides_in_bytes = tensor_info.strides_in_bytes(); // start from the input strides, then override dims 1 and 2
+        strides_in_bytes.set(1, Ip * interleave_by * W * tensor_info.element_size());
+        strides_in_bytes.set(2, Op * interleave_by * W * tensor_info.element_size()); // NOTE(review): H appears in neither stride, only in the total size - confirm intended
+
+        const size_t offset_first_element_in_bytes = tensor_info.offset_first_element_in_bytes();
+
+        // Total size needs to include padded dimensions
+        const size_t total_size_in_bytes = Op * H * W * Ip * tensor_info.element_size();
+
+        const TensorShape TS({tensor_shape[0], arm_compute::ceil_to_multiple<int32_t, int32_t>(tensor_shape[1], 4)}); // NOTE(review): the multiple of 4 is hard-coded rather than derived from weight_format - confirm
+
+        TensorInfo new_tensor_info = tensor_info;
+        new_tensor_info.set_data_layout(DataLayout::UNKNOWN); // the returned info does not keep the input's data layout
+        new_tensor_info.init(TS, tensor_info.num_channels(), data_type, strides_in_bytes, offset_first_element_in_bytes,
+                             total_size_in_bytes);
+        return new_tensor_info;
+    }
+
+    TensorType compute_target(
+        TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, DataType data_type)
+    { // Reorders B into the weight format reported by has_opt_impl, runs the fixed-format GEMM and returns dst.
+        ARM_COMPUTE_UNUSED(shape_c);
+        permute(shape_b, PermutationVector(1U, 0U)); // applies the (1, 0) permutation to the local by-value copy of B's shape
+        // Create tensors
+        TensorType a   = create_tensor<TensorType>(shape_a, data_type, 1, QuantizationInfo(), DataLayout::NCHW);
+        TensorType b   = create_tensor<TensorType>(shape_b, data_type, 1, QuantizationInfo(), DataLayout::NCHW);
+        TensorType c   = nullptr; // NOTE(review): c is a TensorType object here, not a null pointer as in the generic fixture's compute_target; &c is still added to the packs below - confirm intended
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, QuantizationInfo(), DataLayout::NCHW);
+
+        // Create and configure function
+        FunctionType              gemm;
+        NEReorderLayer            reorder;
+        arm_compute::WeightFormat computed_weight_format{arm_compute::WeightFormat::ANY};
+        GEMMInfo                  gemm_info;
+
+        gemm_info.set_fixed_format(true);
+        gemm_info.set_accumulate(false);
+        gemm_info.set_weight_format(computed_weight_format); // WeightFormat::ANY at this point
+
+        const bool kernel_found = bool( // presumably has_opt_impl writes the selected format into computed_weight_format - verify against its signature
+            FunctionType::has_opt_impl(computed_weight_format, a.info(), b.info(), nullptr, dst.info(), gemm_info));
+
+        ARM_COMPUTE_ASSERT(kernel_found);
+        gemm_info.set_weight_format(computed_weight_format); // set again after the has_opt_impl query
+        gemm_info.set_fast_math(is_fixed_format_fast_math(computed_weight_format));
+        TensorType b_transformed = create_tensor<TensorType>(prepare_weights(*b.info(), computed_weight_format));
+
+        a.info()->set_are_values_constant(false);
+        b_transformed.info()->set_are_values_constant(false);
+
+        ARM_COMPUTE_ASSERT(a.info()->is_resizable()); // infos must still be resizable before allocation
+        ARM_COMPUTE_ASSERT(b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(b_transformed.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // Allocate tensors
+        a.allocator()->allocate();
+        b.allocator()->allocate();
+        b_transformed.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!a.info()->is_resizable()); // and must no longer be resizable once allocated
+        ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!b_transformed.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Fill tensors (seeds 0 and 1, bounds -1.f and 1.f)
+        this->fill(AccessorType(a), 0, -1.f, 1.f);
+        this->fill(AccessorType(b), 1, -1.f, 1.f);
+
+        // Reorder weight to the expected format
+        reorder.configure(&b, &b_transformed, WeightFormat::OHWI, computed_weight_format);
+        reorder.run();
+
+        ARM_COMPUTE_ASSERT(gemm.validate(a.info(), b_transformed.info(), nullptr, dst.info(), gemm_info)); // validated against the reordered B, with no C info
+        gemm.configure(a.info(), b_transformed.info(), nullptr, dst.info(), gemm_info);
+        ARM_COMPUTE_ASSERT(gemm.is_configured());
+
+        ITensorPack run_pack;
+        run_pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_0, &a);
+        run_pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &b_transformed);
+        run_pack.add_tensor(arm_compute::TensorType::ACL_SRC_2, &c);
+        run_pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst);
+
+        // Prepare memory
+        ITensorPack prep_pack{{arm_compute::TensorType::ACL_SRC_1, &b_transformed},
+                              {arm_compute::TensorType::ACL_SRC_2, &c}};
+
+        experimental::MemoryRequirements aux_mem_req = gemm.workspace();
+        MemoryGroup                      memory_group{};
+
+        WorkspaceData<Tensor> workspace = manage_workspace<Tensor>(aux_mem_req, memory_group, run_pack, prep_pack);
+
+        gemm.prepare(prep_pack);
+        MemoryGroupResourceScope scope_mg(memory_group);
+
+        auto has_reshape = std::find_if(aux_mem_req.begin(), aux_mem_req.end(), // first workspace entry with Persistent lifetime, if any
+                                        [](const arm_compute::experimental::MemoryInfo &m) -> bool {
+                                            return m.lifetime == arm_compute::experimental::MemoryLifetime::Persistent;
+                                        });
+
+        if (has_reshape != std::end(aux_mem_req)) // a Persistent entry exists: mark b_transformed unused instead of re-adding it
+        {
+            b_transformed.mark_as_unused();
+        }
+        else
+        {
+            run_pack.add_const_tensor(ACL_SRC_1, &b_transformed);
+        }
+
+        // Release temporary tensors that are only used in prepare stage
+        release_temporaries<Tensor>(aux_mem_req, workspace);
+        // End of preparing
+
+        gemm.run(run_pack);
+
+        a.allocator()->free(); // inputs are freed; dst stays allocated because it is returned
+        b.allocator()->free();
+        b_transformed.allocator()->free();
+
+        return dst;
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+    bool            _kernel_found{false};
 };
 
+#endif //ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/fixtures/CpuGemmConv2dFixture.h b/tests/validation/fixtures/CpuGemmConv2dFixture.h
index c8e82fb8a0..67ba4e74db 100644
--- a/tests/validation/fixtures/CpuGemmConv2dFixture.h
+++ b/tests/validation/fixtures/CpuGemmConv2dFixture.h
@@ -158,6 +158,183 @@ class CpuGemmConv2dValidationFixture : public framework::Fixture
     DataLayout          _data_layout{DataLayout::NHWC};
 };
 
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW>
+class CpuGemmConv2dStaticQuantValidationFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>
+{
+public:
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights,
+               DataType data_type, DataType weights_data_type, DataLayout data_layout, QuantizationInfo quantization_info, QuantizationInfo weight_quantization_info, ActivationLayerInfo act_info)
+    {
+        ARM_COMPUTE_ASSERT(data_type == DataType::QASYMM8_SIGNED || data_type == DataType::QASYMM8);
+
+        // This hash is used by random generators. There may be hash collisions but
+        // this is intentional as it's a very easy way to make the current
+        // random generation process differ for most test configurations,
+        // which were using the same set of values before.
+        this->_hash = input_shape[0] + input_shape[1] + input_shape[2] + input_shape[3] +
+              weights_shape[0] + weights_shape[1] + weights_shape[2] + weights_shape[3] +
+              (data_type == DataType::QASYMM8_SIGNED) + (data_layout == DataLayout::NHWC);
+
+        this->_data_type                = data_type;
+        this->_weights_data_type        = weights_data_type;
+        this->_bias_data_type           = DataType::S32;
+        this->_output_data_type         = data_type;
+        this->_quantization_info        = quantization_info;
+        this->_weight_quantization_info = weight_quantization_info;
+        this->_data_layout              = data_layout;
+        this->_dst_q_info               = quantization_info;
+        // Only for non-symmetric weights with no (or an identity) activation: run setup_quantization() and flag dynamic output quantization
+        if(!is_data_type_quantized_symmetric(weights_data_type) && (!act_info.enabled() || act_info.activation() == ActivationFunction::IDENTITY))
+        {
+            this->setup_quantization(input_shape, weights_shape, this->_quantization_info, this->_weight_quantization_info, data_type);
+            this->_use_dynamic_output_quant = true;
+        }
+
+        this->_target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info);
+
+        this->_reference = this->compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info);
+    }
+
+protected:
+    // Compute the target when updating static quantization information after configuration for the stateless api:
+    // tensors are created with placeholder quantization info, the real info is set after configure() and then passed on via update_quantization_parameters().
+    TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &info,
+                              bool reshape_weights, const Size2D &dilation, const ActivationLayerInfo act_info, PaddingList pre_pad_layer = PaddingList({}), bool padded_weights = false)
+    {
+        ARM_COMPUTE_ASSERT((std::is_same<FunctionType, experimental::op::CpuGemmConv2d>::value == true));
+
+        ARM_COMPUTE_ERROR_ON((input_shape[2] % weights_shape[2]) != 0);
+
+        const unsigned int num_groups = input_shape[2] / weights_shape[2];
+
+        if(this->_data_layout == DataLayout::NHWC)
+        {
+            permute(input_shape, PermutationVector(2U, 0U, 1U));
+            permute(weights_shape, PermutationVector(2U, 0U, 1U));
+            permute(output_shape, PermutationVector(2U, 0U, 1U));
+
+            if(pre_pad_layer.size() > 0)
+            {
+                // make sure paddings exist for each c,h,w dimension; the size is re-checked on every pass so exactly the missing entries are appended
+                while(pre_pad_layer.size() < 3)
+                {
+                    pre_pad_layer.push_back({ 0, 0 });
+                }
+
+                // rotate padding info from nchw to nhwc
+                std::rotate(pre_pad_layer.begin(), pre_pad_layer.begin() + 2, pre_pad_layer.begin() + 3);
+            }
+        }
+
+        const int idx_width  = get_data_layout_dimension_index(this->_data_layout, DataLayoutDimension::WIDTH);
+        const int idx_height = get_data_layout_dimension_index(this->_data_layout, DataLayoutDimension::HEIGHT);
+
+        WeightsInfo weights_info(!reshape_weights, weights_shape[idx_width], weights_shape[idx_height], weights_shape[3]);
+        TensorShape reshaped_weights_shape(weights_shape);
+
+        // Create tensors with fake quantization info and defer to pass the correct ones to a later stage.
+        auto qi = QuantizationInfo(0.550721, 37, true);
+        TensorType src     = create_tensor<TensorType>(input_shape, this->_data_type, 1, qi, this->_data_layout);
+        TensorType weights = create_tensor<TensorType>(reshaped_weights_shape, this->_weights_data_type, 1, qi, this->_data_layout);
+        TensorType dst     = create_tensor<TensorType>(output_shape, this->_output_data_type, 1, qi, this->_data_layout);
+        TensorType bias    = create_tensor<TensorType>(bias_shape, this->_bias_data_type, 1, QuantizationInfo() /*bias is not a quantized type*/, this->_data_layout);
+
+        // Create and configure function
+        FunctionType conv;
+
+        const unsigned int height_index = arm_compute::graph::get_dimension_idx(this->_data_layout, DataLayoutDimension::HEIGHT);
+        const unsigned int width_index  = arm_compute::graph::get_dimension_idx(this->_data_layout, DataLayoutDimension::WIDTH);
+
+        const PaddingInfo pad_w = width_index < pre_pad_layer.size() ? pre_pad_layer[width_index] : PaddingInfo(0, 0);
+        const PaddingInfo pad_h = height_index < pre_pad_layer.size() ? pre_pad_layer[height_index] : PaddingInfo(0, 0);
+
+        if(pre_pad_layer.size() > 0 && arm_compute::graph::is_padding_in_height_or_width(this->_data_layout, pre_pad_layer))
+        {
+            // this is the logic implemented in NodeFusionMutator -> fuse_pad_with_convolution
+            const PadStrideInfo new_conv_info(
+                info.stride().first,
+                info.stride().second,
+                info.pad_left() + pad_w.first,
+                info.pad_right() + pad_w.second,
+                info.pad_top() + pad_h.first,
+                info.pad_bottom() + pad_h.second,
+                info.round());
+
+            conv.configure(src.info(), weights.info(), bias.info(), dst.info(), new_conv_info, weights_info, dilation, act_info, false /* enable_fast_math */, num_groups);
+            auto const status = conv.validate(src.info(), weights.info(), bias.info(), dst.info(), new_conv_info);
+            ARM_COMPUTE_ASSERT(status);
+        }
+        else
+        {
+            conv.configure(src.info(), weights.info(), bias.info(), dst.info(), info, weights_info, dilation, act_info, false /* enable_fast_math */, num_groups);
+            auto const status = conv.validate(src.info(), weights.info(), bias.info(), dst.info(), info);
+            ARM_COMPUTE_ASSERT(status);
+        }
+
+        // After calling configure, we appropriately set the correct quantization info and update ACL.
+        src.info()->set_quantization_info(QuantizationInfo(this->_quantization_info.scale(), this->_quantization_info.offset(), true));
+        weights.info()->set_quantization_info(QuantizationInfo(this->_weight_quantization_info.scale(), this->_weight_quantization_info.offset(), true));
+        dst.info()->set_quantization_info(QuantizationInfo(this->_dst_q_info.scale(), this->_dst_q_info.offset(), true));
+
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // Test "add padding after configure" behavior. This behavior should not affect the correctness
+        add_padding_x({ &src, &bias, &dst }, this->_data_layout);
+        // Padding weights may affect code path in some backends
+        if (padded_weights)
+        {
+            add_padding_x({ &weights }, this->_data_layout);
+        }
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        weights.allocator()->allocate();
+        bias.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ITensorPack run_pack{
+            {ACL_SRC_0, &src}, {ACL_SRC_1, &weights}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}};
+        ITensorPack prep_pack{{ACL_SRC_1, &weights}, {ACL_SRC_2, &bias}};
+
+        // Propagate the correct quantization info through ACL
+        conv.update_quantization_parameters(run_pack);
+
+        auto mg = MemoryGroup{};
+        auto ws = manage_workspace<Tensor>(conv.workspace(), mg, run_pack, prep_pack);
+
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!weights.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Fill tensors
+        this->fill(AccessorType(src), 0 + this->_hash);
+        this->fill(AccessorType(weights), 1 + this->_hash);
+        this->fill(AccessorType(bias), 2 + this->_hash);
+
+        // Compute Convolution function
+        conv.prepare(prep_pack);
+        conv.run(run_pack);
+
+        return dst;
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class CpuGemmConv2dForUpdatedStaticQuantInfoAfterConfigureFixture : public CpuGemmConv2dStaticQuantValidationFixture<TensorType, AccessorType, FunctionType, T, T>
+{
+public:
+    // Forwards to the parent setup, passing data_type and quantization_info for both the input and the weights.
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type, DataLayout data_layout, QuantizationInfo quantization_info, ActivationLayerInfo act_info)
+    {
+        using Parent = CpuGemmConv2dStaticQuantValidationFixture<TensorType, AccessorType, FunctionType, T, T>;
+        Parent::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, data_type, data_type, data_layout, quantization_info, quantization_info, act_info);
+    }
+};
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/fixtures/CpuQuantizeFixture.h b/tests/validation/fixtures/CpuQuantizeFixture.h
new file mode 100644
index 0000000000..fb2be16339
--- /dev/null
+++ b/tests/validation/fixtures/CpuQuantizeFixture.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_CPUQUANTIZEFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_CPUQUANTIZEFIXTURE_H
+
+#include "tests/validation/fixtures/QuantizationLayerFixture.h"
+#include "tests/validation/Helpers.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "tests/validation/Helpers.h"
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename Tin, typename Tout>
+class CpuQuantizationValidationFixture : public QuantizationValidationFixture<TensorType, AccessorType, FunctionType, Tin, Tout>
+{
+public:
+    void setup(TensorShape shape, DataType data_type_in, DataType data_type_out, QuantizationInfo qinfo)
+    {
+        // Return early, computing nothing, when F16 is involved on a Tensor backend whose CPU lacks FP16
+        const bool is_cpu    = std::is_same<TensorType, Tensor>::value;
+        const bool uses_fp16 = (data_type_in == DataType::F16) || (data_type_out == DataType::F16);
+        if(is_cpu && uses_fp16 && !CPUInfo::get().has_fp16())
+        {
+            return;
+        }
+        if(!cpu_supports_dtypes({data_type_in, data_type_out}))
+        {
+            return;
+        }
+        QuantizationInfo src_qinfo;
+        this->_target    = compute_target(shape, data_type_in, data_type_out, qinfo, src_qinfo);
+        this->_reference = this->compute_reference(shape, data_type_in, data_type_out, qinfo, src_qinfo);
+    }
+
+protected:
+    TensorType compute_target(const TensorShape &shape, DataType data_type_in, DataType data_type_out, QuantizationInfo qinfo, QuantizationInfo qinfo_in)
+    {
+        // Source uses qinfo_in, destination uses qinfo
+        TensorType src = create_tensor<TensorType>(shape, data_type_in, 1, qinfo_in);
+        TensorType dst = create_tensor<TensorType>(shape, data_type_out, 1, qinfo);
+
+        // Configure the operator from the tensor infos
+        FunctionType quantize_op;
+        quantize_op.configure(src.info(), dst.info());
+
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // Allocate the tensors; they are expected to stop being resizable afterwards
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Populate the input
+        this->fill(AccessorType(src));
+
+        // Build the run pack and set up the workspace reported by the operator
+        ITensorPack pack = { { arm_compute::TensorType::ACL_SRC, &src }, { arm_compute::TensorType::ACL_DST, &dst } };
+        auto memory_group = MemoryGroup{};
+        auto workspace    = arm_compute::manage_workspace<TensorType>(quantize_op.workspace(), memory_group, pack);
+        allocate_tensors(quantize_op.workspace(), workspace);
+
+        // Execute
+        quantize_op.run(pack);
+
+        return dst;
+    }
+};
+
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+
+
+#endif // ACL_TESTS_VALIDATION_FIXTURES_CPUQUANTIZEFIXTURE_H
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index 854442b174..e572f41ec4 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -95,7 +95,7 @@ template <typename TensorType, typename AccessorType, typename FunctionType, boo
 TensorType compute_gemmlowp_target_for_updated_sq_info_after_config(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
                                    const QuantizationInfo& output_qinfo, DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8,
                                    GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(),
-                                   bool accumulate = false, DataType data_type_output = DataType::UNKNOWN)
+                                   bool accumulate = false, DataType data_type_output = DataType::UNKNOWN, const ActivationLayerInfo& act_info = ActivationLayerInfo())
 {
     ARM_COMPUTE_ASSERT((std::is_same<FunctionType, NEGEMMLowpMatrixMultiplyCore>::value == true));
     ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a));
@@ -125,7 +125,7 @@ TensorType compute_gemmlowp_target_for_updated_sq_info_after_config(const Tensor
 
     gemmlowp.configure(&a, &b, is_fused ? &bias : nullptr, &output, GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false,
                                                                              output_stage, false /*fp_mixed_precision*/, false /*fast_math*/, false /*broadcast_bias*/,
-                                                                             arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED,
+                                                                             act_info, false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED,
                                                                              false /* pretranspose_B */, accumulate));
 
     ARM_COMPUTE_ASSERT(a.info()->is_resizable());
@@ -216,7 +216,7 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape
     FunctionType gemmlowp;
     gemmlowp.configure(&a, &b, is_fused ? &bias : nullptr, &output, GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false,
                                                                              output_stage, false /*fp_mixed_precision*/, false /*fast_math*/, false /*broadcast_bias*/,
-                                                                             arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED,
+                                                                             ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED,
                                                                              false /* pretranspose_B */, accumulate));
 
     // If the QuantizationInfo is dynamic, it needs to be settable after configure (note that we also force it to be dynamic)
@@ -467,9 +467,15 @@ class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : publ
                                         const QuantizationInfo& a_qinfo,
                                         const QuantizationInfo& b_qinfo,
                                         const QuantizationInfo& output_qinfo,
+                                        const ActivationLayerInfo& act_info,
                                         GEMMLowpOutputStageType type,
                                         GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
     {
+        // Supported activations in GEMM
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+            ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+            ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+
         ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_quantized_asymmetric(data_type));
 
         const UniformQuantizationInfo aq_unif   = a_qinfo.uniform();
@@ -485,7 +491,14 @@ class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : publ
 
         int32_t type_min             = 0;
         int32_t type_max             = 0;
-        std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(output_qinfo, ActivationLayerInfo(), data_type);
+
+        if (supported_acts.find(act_info.activation()) != supported_acts.end())
+        {
+            std::tie(type_min, type_max) =
+                arm_compute::get_quantized_activation_min_max(act_info, data_type, oq_unif);
+        } else {
+            std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(output_qinfo, ActivationLayerInfo(), data_type);
+        }
 
         gemmlowp_output_stage_info.gemmlowp_real_multiplier = multiplier;
         gemmlowp_output_stage_info.gemmlowp_multiplier = int_multiplier;
@@ -507,7 +520,7 @@ class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : publ
      *
      */
     void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type,
-               bool reshape_b_only_on_first_run, bool updated_sq_info_after_config = false)
+               bool reshape_b_only_on_first_run, bool updated_sq_info_after_config = false, const ActivationLayerInfo& act_info = ActivationLayerInfo())
     {
         ARM_COMPUTE_ASSERT(output_stage_type != GEMMLowpOutputStageType::NONE);
         ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type));
@@ -521,20 +534,20 @@ class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : publ
         setup_quantization<TI>(data_type, shape_a, shape_b, a_qinfo, b_qinfo, output_qinfo, finfo);
 
         GEMMLowpOutputStageInfo output_stage;
-        init_gemmlowp_output_stage_info(data_type, a_qinfo, b_qinfo, output_qinfo, output_stage_type, output_stage);
+        init_gemmlowp_output_stage_info(data_type, a_qinfo, b_qinfo, output_qinfo, act_info, output_stage_type, output_stage);
 
         _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type, data_type, output_stage, finfo);
-        _target    = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, data_type, data_type, output_stage, reshape_b_only_on_first_run, finfo, updated_sq_info_after_config);
+        _target    = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, data_type, data_type, output_stage, reshape_b_only_on_first_run, finfo, updated_sq_info_after_config, act_info);
     }
 
 protected:
     TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const QuantizationInfo& output_qinfo,
-                              DataType data_type_a, DataType data_type_b, const GEMMLowpOutputStageInfo& output_stage, bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(), bool updated_sq_info_after_config = false)
+                              DataType data_type_a, DataType data_type_b, const GEMMLowpOutputStageInfo& output_stage, bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(), bool updated_sq_info_after_config = false, const ActivationLayerInfo& act_info = ActivationLayerInfo())
     {
         if (updated_sq_info_after_config)
         {
             return compute_gemmlowp_target_for_updated_sq_info_after_config<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, qasymm8_t, true, run_twice>(shape_a, shape_b, shape_output, a_qinfo,
-                    b_qinfo, output_qinfo, data_type_a, data_type_b, output_stage, reshape_b_only_on_first_run, finfo);
+                    b_qinfo, output_qinfo, data_type_a, data_type_b, output_stage, reshape_b_only_on_first_run, finfo, false, arm_compute::DataType::UNKNOWN, act_info);
         }
         else
         {
diff --git a/tests/validation/fixtures/PermuteFixture.h b/tests/validation/fixtures/PermuteFixture.h
index b1b3845a8d..7dacf5181a 100644
--- a/tests/validation/fixtures/PermuteFixture.h
+++ b/tests/validation/fixtures/PermuteFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, 2023 Arm Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_PERMUTE_FIXTURE
-#define ARM_COMPUTE_TEST_PERMUTE_FIXTURE
+#ifndef ACL_TESTS_VALIDATION_FIXTURES_PERMUTEFIXTURE_H
+#define ACL_TESTS_VALIDATION_FIXTURES_PERMUTEFIXTURE_H
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorShape.h"
@@ -33,6 +33,7 @@
 #include "tests/IAccessor.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Fixture.h"
+#include "tests/validation/Helpers.h"
 #include "tests/validation/reference/Permute.h"
 
 namespace arm_compute
@@ -47,6 +48,12 @@ class PermuteValidationFixture : public framework::Fixture
 public:
     void setup(TensorShape input_shape, PermutationVector perm, DataType data_type)
     {
+        if (std::is_same<TensorType, Tensor>::value && // CPU
+            !cpu_supports_dtypes({data_type}))
+        {
+            return;
+        }
+
         _target    = compute_target(input_shape, data_type, perm);
         _reference = compute_reference(input_shape, data_type, perm);
     }
@@ -108,4 +115,4 @@ class PermuteValidationFixture : public framework::Fixture
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_PERMUTE_FIXTURE */
+#endif // ACL_TESTS_VALIDATION_FIXTURES_PERMUTEFIXTURE_H
diff --git a/tests/validation/fixtures/SoftmaxLayerFixture.h b/tests/validation/fixtures/SoftmaxLayerFixture.h
index 399a8b70c4..582018f2f1 100644
--- a/tests/validation/fixtures/SoftmaxLayerFixture.h
+++ b/tests/validation/fixtures/SoftmaxLayerFixture.h
@@ -73,8 +73,9 @@ class SoftmaxValidationGenericFixture : public framework::Fixture
         {
             arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -10.0f, 10.0f };
             library->fill(tensor, distribution, 0);
-        }
-        else if(!is_data_type_quantized(tensor.data_type()))
+        }else if(tensor.data_type() == DataType::BFLOAT16){
+            library->fill_tensor_uniform(tensor, 0);
+        }else if(!is_data_type_quantized(tensor.data_type()))
         {
             std::uniform_int_distribution<> distribution(0, 100);
             library->fill(tensor, distribution, 0);
diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp
index 30c577d850..0eb1f46848 100644
--- a/tests/validation/reference/GEMMLowp.cpp
+++ b/tests/validation/reference/GEMMLowp.cpp
@@ -130,14 +130,17 @@ void quantize_down_scale_by_fixedpoint(const SimpleTensor<TIn> *in, const Simple
         }
         result += result_offset_after_shift;
 
-        // Bounded ReLu
-        if(min != max)
+        result = std::max(min, std::min(max, result));
+
+        if (min == std::numeric_limits<TIn>::min() && max == std::numeric_limits<TIn>::max())
         {
-            result = std::max(min, std::min(max, result));
+            (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
+                                                                std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
+        }
+        else
+        {
+            (*dst)[i] = static_cast<TOut>(result);
         }
-
-        (*dst)[i] = static_cast<TOut>(std::max<TIn>(std::numeric_limits<TOut>::lowest(),
-                                                    std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
     }
 }
 
diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp
index 3fbac32a9b..08c091b4a9 100644
--- a/tests/validation/reference/SoftmaxLayer.cpp
+++ b/tests/validation/reference/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -103,19 +103,97 @@ SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, in
     return dst;
 }
 
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+SimpleTensor<T> softmax_layer_bfloat16(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log)
+{
+    // Reference softmax / log-softmax along a single axis.
+    // The maximum, the exponentials and their sum are handled as float, while every
+    // value stored in the output tensor is converted back to T on assignment.
+    // beta scales the inputs after the maximum has been subtracted.
+    SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
+
+    const int32_t rank = static_cast<int32_t>(src.shape().num_dimensions());
+    ARM_COMPUTE_ERROR_ON(axis < -rank || axis >= rank);
+
+    // Resolve the axis through wrap_around() and restrict that window dimension to a
+    // single step; the loops inside the lambda walk the axis themselves.
+    const unsigned int softmax_axis = static_cast<unsigned int>(wrap_around(axis, rank));
+    const unsigned int axis_len     = src.shape()[softmax_axis];
+    Window             slices;
+    slices.use_tensor_dimensions(src.shape());
+    slices.set(softmax_axis, Window::Dimension(0, 1, 1));
+
+    // Scaling factor applied to the max-shifted inputs
+    const float scale = float(beta);
+
+    execute_window_loop(slices, [&](const Coordinates & id)
+    {
+        Coordinates pos(id);
+
+        // Pass 1: largest element along the axis
+        pos.set(softmax_axis, 0);
+        float slice_max = float(*reinterpret_cast<const T *>(src(pos)));
+        for(unsigned int k = 1; k < axis_len; ++k)
+        {
+            pos.set(softmax_axis, k);
+            const float candidate = float(*reinterpret_cast<const T *>(src(pos)));
+            slice_max             = (candidate > slice_max) ? candidate : slice_max;
+        }
+
+        // Pass 2: accumulate the exponentials of the shifted and scaled inputs
+        float exp_sum(0.f);
+        for(unsigned int k = 0; k < axis_len; ++k)
+        {
+            pos.set(softmax_axis, k);
+            const float in_f     = float(*reinterpret_cast<const T *>(src(pos)));
+            const float shifted  = (in_f - slice_max) * scale;
+            const float exponent = std::exp(shifted);
+            exp_sum += exponent;
+
+            // Log-softmax keeps the shifted value, plain softmax keeps its exponential
+            *reinterpret_cast<T *>(dst(pos)) = is_log ? shifted : exponent;
+        }
+
+        // Pass 3: normalize what pass 2 stored
+        if(is_log)
+        {
+            for(unsigned int k = 0; k < axis_len; ++k)
+            {
+                pos.set(softmax_axis, k);
+                const T stored = *reinterpret_cast<const T *>(dst(pos));
+                *reinterpret_cast<T *>(dst(pos)) = stored - static_cast<T>(std::log(exp_sum));
+            }
+        }
+        else
+        {
+            for(unsigned int k = 0; k < axis_len; ++k)
+            {
+                pos.set(softmax_axis, k);
+                const float stored_f = float(*reinterpret_cast<const T *>(dst(pos)));
+                *reinterpret_cast<T *>(dst(pos)) = stored_f / exp_sum;
+            }
+        }
+    });
+
+    return dst;
+}
+
 template SimpleTensor<float> softmax_layer_generic(const SimpleTensor<float> &src, float beta, int32_t axis, bool is_log);
 template SimpleTensor<half> softmax_layer_generic(const SimpleTensor<half> &src, float beta, int32_t axis, bool is_log);
 
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
 SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log)
 {
-    return softmax_layer_generic<T>(src, beta, axis, is_log);
+    // bfloat16 inputs go to the dedicated bfloat16 reference; every other type uses the generic one
+    const bool is_bf16 = std::is_same<T, bfloat16>::value;
+
+    return is_bf16 ? softmax_layer_bfloat16<T>(src, beta, axis, is_log)
+                   : softmax_layer_generic<T>(src, beta, axis, is_log);
 }
 
 template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type >
 SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log)
-{
-    const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log);
+{    const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log);
 
     SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
     SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta, axis, is_log);
@@ -127,6 +205,7 @@ template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float
 template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta, int32_t axis, bool is_log);
 template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta, int32_t axis, bool is_log);
 template SimpleTensor<int8_t> softmax_layer(const SimpleTensor<int8_t> &src, float beta, int32_t axis, bool is_log);
+template SimpleTensor<bfloat16> softmax_layer(const SimpleTensor<bfloat16> &src, float beta, int32_t axis, bool is_log);
 
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/SoftmaxLayer.h b/tests/validation/reference/SoftmaxLayer.h
index 3362f195c9..609d10b3ac 100644
--- a/tests/validation/reference/SoftmaxLayer.h
+++ b/tests/validation/reference/SoftmaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_TEST_SOFTMAX_LAYER_H
-#define ARM_COMPUTE_TEST_SOFTMAX_LAYER_H
+#ifndef ACL_TESTS_VALIDATION_REFERENCE_SOFTMAXLAYER_H
+#define ACL_TESTS_VALIDATION_REFERENCE_SOFTMAXLAYER_H
 
 #include "tests/SimpleTensor.h"
 #include "tests/validation/Helpers.h"
@@ -38,6 +38,9 @@ namespace reference
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
 SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log = false);
 
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+SimpleTensor<T> softmax_layer_bfloat16(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log = false);
+
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
 SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis = 0, bool is_log = false);
 
@@ -47,4 +50,4 @@ SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t ax
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_SOFTMAX_LAYER_H */
+#endif // ACL_TESTS_VALIDATION_REFERENCE_SOFTMAXLAYER_H
diff --git a/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp b/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp
index 613ec24bff..a5f3323160 100644
--- a/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp
+++ b/tests/validation/runtime/experimental/low_level/CpuGemmAssemblyDispatch.cpp
@@ -24,6 +24,8 @@
 #include "arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h"
 
 #include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "tests/datasets/DatatypeDataset.h"
 #include "tests/datasets/LargeGEMMDataset.h"
 #include "tests/datasets/SmallGEMMDataset.h"
 #include "tests/framework/Asserts.h"
@@ -137,7 +139,8 @@ TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
     auto result_1 = run_conv();
     for (size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
     {
-        ARM_COMPUTE_EXPECT(reinterpret_cast<float *>(result_0.buffer())[i] == reinterpret_cast<float *>(result_1.buffer())[i],
+        ARM_COMPUTE_EXPECT(reinterpret_cast<float *>(result_0.buffer())[i] ==
+                               reinterpret_cast<float *>(result_1.buffer())[i],
                            framework::LogLevel::ERRORS);
     }
 }
@@ -192,111 +195,216 @@ TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
     auto result_1 = run_conv();
     for (size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
     {
-        ARM_COMPUTE_EXPECT((reinterpret_cast<float *>(result_0.buffer()))[i] == (reinterpret_cast<float *>(result_1.buffer()))[i],
+        ARM_COMPUTE_EXPECT((reinterpret_cast<float *>(result_0.buffer()))[i] ==
+                               (reinterpret_cast<float *>(result_1.buffer()))[i],
                            framework::LogLevel::ERRORS);
     };
 }
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
-               make("LhsInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::S32), // Unsupported data type
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),
-                                                     }),
-               make("RhsInfo",{ TensorInfo(TensorShape(8U, 27U), 1, DataType::S32),
-                                                        TensorInfo(TensorShape(8U, 27U), 1, DataType::F32),
-                                                     }),
-               make("OutputInfo",{ TensorInfo(TensorShape(8U, 13U), 1, DataType::S32),
-                                                        TensorInfo(TensorShape(8U, 13U), 1, DataType::F32),
-                                                     }),
-               make("Expected", { false, true })),
-               lhs_info, rhs_info, output_info, expected)
+DATA_TEST_CASE(ValidateAllDataTypes,
+               framework::DatasetMode::ALL,
+               combine(
+                    datasets::AllDataTypes("DataType"),
+                    datasets::AllDataTypes("DataType"),
+                    datasets::AllDataTypes("DataType"),
+                    make("fixed_format", {true, false})),
+               lhs_data_type, rhs_data_type, output_data_type, fixed_format)
 {
-    const auto gemm_info = GEMMInfo();
+    auto gemm_info = GEMMInfo();
+    auto asm_info = arm_compute::cpu::AsmGemmInfo();
+    auto lhs_info = TensorInfo(TensorShape(21U, 13U), 1, lhs_data_type);
+    auto rhs_info = TensorInfo(TensorShape(33U, 21U), 1, rhs_data_type);
+    auto output_info = TensorInfo(TensorShape(33U, 13U), 1, output_data_type);
+    gemm_info.set_fixed_format(fixed_format);
+    asm_info.fixed_format = fixed_format;
+
+    if (fixed_format) {
+        WeightFormat wf = WeightFormat::ANY;
+        gemm_info.set_accumulate(false);
+        asm_info.accumulate = false;
+        gemm_info.set_weight_format(wf);
+        asm_info.weight_format = wf;
+        gemm_info.set_fast_math(rhs_data_type == DataType::BFLOAT16 && fixed_format);
+        asm_info.fast_mode = rhs_data_type == DataType::BFLOAT16 && fixed_format;
+
+        experimental::op::ll::CpuGemmAssemblyDispatch::has_opt_impl(wf, &lhs_info, &rhs_info, nullptr, &output_info, gemm_info);
+        gemm_info.set_weight_format(wf);
+        asm_info.weight_format = wf;
+        rhs_info.set_data_layout(DataLayout::NCHW);
+    }
+
+    const auto supports = {
+        std::make_tuple(DataType::F32, DataType::F32, DataType::F32),
+        std::make_tuple(DataType::F16, DataType::F16, DataType::F16),
+        std::make_tuple(DataType::BFLOAT16, DataType::BFLOAT16, DataType::BFLOAT16),
+        std::make_tuple(DataType::BFLOAT16, DataType::BFLOAT16, DataType::F32),
+        std::make_tuple(DataType::F32, DataType::BFLOAT16, DataType::F32),
+    };
+    const auto config = std::make_tuple(lhs_data_type, rhs_data_type, output_data_type);
+
+    bool expected = arm_compute::cpu::CpuGemmAssemblyDispatch::validate(&lhs_info.clone()->set_is_resizable(true), &rhs_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true), asm_info) &&
+                    (std::find(supports.begin(), supports.end(), config) != supports.end());
+
     bool is_valid = bool(experimental::op::ll::CpuGemmAssemblyDispatch::validate(&lhs_info.clone()->set_is_resizable(true), &rhs_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true), gemm_info));
     ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
 }
 
 template <typename T>
-using CpuGemmAssemblyDispatchFixture = CpuGemmAssemblyDispatchValidationFixture<Tensor, Accessor, experimental::op::ll::CpuGemmAssemblyDispatch, T, false /* accumulate */>;
+using CpuGemmAssemblyDispatchFixture = CpuGemmAssemblyDispatchValidationFixture<Tensor, Accessor, experimental::op::ll::CpuGemmAssemblyDispatch, T>;
+
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
 template <typename T>
-using CpuGemmAccumulateFixture = CpuGemmAssemblyDispatchValidationFixture<Tensor, Accessor, experimental::op::ll::CpuGemmAssemblyDispatch, T, true /* accumulate */>;
+using CpuGemmFixedFormatFixture = CpuGemmAssemblyDispatchFixedFormatFixture<Tensor, Accessor, experimental::op::ll::CpuGemmAssemblyDispatch, T>;
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
 
 TEST_SUITE(Float)
 
-DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine(
-                                                                     zip(make("In0",{ TensorShape(21U, 13U) }),
-                                                                     make("In1", { TensorShape(33U, 21U) }),
-                                                                     make("Dst", { TensorShape(33U, 13U) })),
-                                                                     zip(
-                                                                     make("is_c_null", { false, false, false, true }),
-                                                                     make("Expected", { true, true, true, true }))),
-               shape_a, shape_b, shape_dst, is_c_null, expected)
+DATA_TEST_CASE(ValidateAccumulate,
+               framework::DatasetMode::ALL,
+               combine(
+                    make("In0",{ TensorShape(21U, 13U) }),
+                    make("In1", { TensorShape(33U, 21U) }),
+                    make("Dst", { TensorShape(33U, 13U) }),
+                    make("Expected", { true })),
+               shape_a, shape_b, shape_dst, expected)
 {
-    ARM_COMPUTE_UNUSED(is_c_null);
     /* Accumulation test for GEMM kernels */
     // Create tensors
     TensorInfo in_a(shape_a, 1, DataType::F32);
     TensorInfo in_b(shape_b, 1, DataType::F32);
-    TensorInfo in_c(shape_dst, 1, DataType::F32);
     TensorInfo dst(shape_dst, 1, DataType::F32);
 
     GEMMInfo gemm_info = GEMMInfo();
     gemm_info.set_accumulate(true);
 
     // Validate accumulation
-    Status status = experimental::op::ll::CpuGemmAssemblyDispatch::validate(&in_a, &in_b, (is_c_null ? nullptr : &in_c), &dst, gemm_info);
+    Status status = experimental::op::ll::CpuGemmAssemblyDispatch::validate(&in_a, &in_b, nullptr, &dst, gemm_info);
     ARM_COMPUTE_EXPECT((expected ==  bool(status)), framework::LogLevel::ERRORS);
 }
 
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmAssemblyDispatchFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallGEMMDataset(),
-                                                                                                         make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       CpuGemmAssemblyDispatchFixture<half>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallGEMMDataset(),
+                            make("DataType", DataType::F16),
+                            make("Accumulate", false),
+                            make("Pretranspose_B", {false, true}),
+                            make("ActivationInfo", {
+                            ActivationLayerInfo(),
+                            ActivationLayerInfo(ActivationFunction::RELU),
+                            ActivationLayerInfo(ActivationFunction::BOUNDED_RELU, 1.f),
+                            ActivationLayerInfo(ActivationFunction::LU_BOUNDED_RELU, 1.f)
+                        })))
 {
-    // Validate output
-    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
+    if(CPUInfo::get().has_fp16())
+    {
+        // Validate output
+        validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. Test SKIPPED.");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CpuGemmAssemblyDispatchFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMDataset(),
-                                                                                                       make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       CpuGemmAssemblyDispatchFixture<half>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeGEMMDataset(),
+                            make("DataType", DataType::F16),
+                            make("Accumulate", false),
+                            make("Pretranspose_B", {false, true}),
+                            make("ActivationInfo", {
+                                ActivationLayerInfo(),
+                                ActivationLayerInfo(ActivationFunction::RELU),
+                                ActivationLayerInfo(ActivationFunction::BOUNDED_RELU, 1.f),
+                                ActivationLayerInfo(ActivationFunction::LU_BOUNDED_RELU, 1.f)
+                            })))
 {
-    // Validate output
-    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
+    if(CPUInfo::get().has_fp16())
+    {
+        // Validate output
+        validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. Test SKIPPED.");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
 }
 
-
 TEST_SUITE_END() // FP16
 #endif /* ARM_COMPUTE_ENABLE_FP16 */
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmAssemblyDispatchFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallGEMMDataset(),
-                                                                                                            make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       CpuGemmAssemblyDispatchFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallGEMMDataset(),
+                            make("DataType", DataType::F32),
+                            make("Accumulate", {false, true}),
+                            make("Pretranspose_B", {false, true}),
+                            make("ActivationInfo", {
+                                ActivationLayerInfo(),
+                                ActivationLayerInfo(ActivationFunction::RELU),
+                                ActivationLayerInfo(ActivationFunction::BOUNDED_RELU, 1.f),
+                                ActivationLayerInfo(ActivationFunction::LU_BOUNDED_RELU, 1.f)
+                       })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CpuGemmAssemblyDispatchFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMDataset(),
-                                                                                                        make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       CpuGemmAssemblyDispatchFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(datasets::LargeGEMMDataset(),
+                            make("DataType", DataType::F32),
+                            make("Accumulate", {false, true}),
+                            make("Pretranspose_B", {false, true}),
+                            make("ActivationInfo", {
+                                ActivationLayerInfo(),
+                                ActivationLayerInfo(ActivationFunction::RELU),
+                                ActivationLayerInfo(ActivationFunction::BOUNDED_RELU, 1.f),
+                                ActivationLayerInfo(ActivationFunction::LU_BOUNDED_RELU, 1.f)
+                       })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
 
 
-TEST_SUITE(ACCUMULATE)
-FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmAccumulateFixture<float>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallAccumulateGEMMDataset(),
-                                                                                                        make("DataType", DataType::F32)))
+#ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+
+TEST_SUITE(FIXED_FORMAT)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       CpuGemmFixedFormatFixture<float>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(
+                            datasets::SmallGEMMDataset(),
+                            make("DataType", DataType::F32)
+                        ))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CpuGemmAccumulateFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeAccumulateGEMMDataset(),
-                                                                                                        make("DataType", DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       CpuGemmFixedFormatFixture<float>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(
+                            datasets::LargeGEMMDataset(),
+                            make("DataType", DataType::F32)
+                        ))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
-TEST_SUITE_END() // ACCUMULATE
+TEST_SUITE_END() // FIXED_FORMAT
+#endif // ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
diff --git a/tests/validation/runtime/experimental/operators/CpuDequantize.cpp b/tests/validation/runtime/experimental/operators/CpuDequantize.cpp
new file mode 100644
index 0000000000..d6c74c4fa2
--- /dev/null
+++ b/tests/validation/runtime/experimental/operators/CpuDequantize.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/experimental/operators/CpuDequantize.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/DatatypeDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/CpuDequantizeFixture.h"
+
+
+/*
+ * Tests for arm_compute::experimental::op::CpuDequantize, which is a shallow wrapper for arm_compute::cpu::CpuDequantize.
+ * Any future tests of cpu::CpuDequantize functionality will be added in tests/NEON/DequantizationLayer.cpp, given that experimental::op::CpuDequantize remains a shallow wrapper.
+*/
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+        using   framework::dataset::make;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 });
+#else  /* ARM_COMPUTE_ENABLE_FP16 */
+const auto data_types = framework::dataset::make("DataType", { DataType::F32 });
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+const auto dataset_quant_f32 = combine(datasets::SmallShapes(),
+                                        datasets::QuantizedTypes(),
+                                             make("DataType", DataType::F32),
+                                     make("DataLayout", { DataLayout::NCHW })
+                                     );
+
+const auto dataset_quant_asymm_signed_f32 = combine(datasets::SmallShapes(),
+                                                                  make("QuantizedTypes", { DataType::QASYMM8_SIGNED }),
+                                                          make("DataType", DataType::F32),
+                                                  make("DataLayout", { DataLayout::NCHW })
+                                                  );
+
+const auto dataset_quant_per_channel_f32 = combine(datasets::SmallShapes(), datasets::QuantizedPerChannelTypes(),
+                                                         make("DataType", DataType::F32),
+                                                 make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
+);
+
+const auto dataset_precommit_f32 = concat(concat(dataset_quant_f32, dataset_quant_per_channel_f32), dataset_quant_asymm_signed_f32);
+
+
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(OPERATORS)
+TEST_SUITE(CpuDequantize)
+
+
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+      make("InputInfo", { TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32),      // Wrong input data type
+                                                TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8),  // Wrong output data type
+                                                TensorInfo(TensorShape(16U, 16U, 2U, 5U), 1, DataType::QASYMM8),   // Mismatching shapes
+                                                TensorInfo(TensorShape(17U, 16U, 16U, 5U), 1, DataType::QASYMM8),  // Valid
+                                                TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8),  // Valid
+                                                TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8_SIGNED),  // Valid
+        }),
+      make("OutputInfo",{ TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32),
+                                                TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::U8),
+                                                TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32),
+                                                TensorInfo(TensorShape(17U, 16U, 16U, 5U), 1, DataType::F32),
+                                                TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32),
+                                                TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32),
+        }),
+      make("Expected", { false, false, false, true, true, true })),
+        input_info, output_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(arm_compute::experimental::op::CpuDequantize::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+
+using arm_compute::experimental::op::CpuDequantize;
+template <typename T>
+using CpuDequantizeFixture = CpuDequantizationValidationFixture<Tensor, Accessor, CpuDequantize, T>;
+
+
+FIXTURE_DATA_TEST_CASE(SmokeTest, CpuDequantizeFixture<float>, framework::DatasetMode::ALL, dataset_precommit_f32)
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+
+TEST_SUITE_END() // CpuDequantize
+TEST_SUITE_END() // OPERATORS
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/runtime/experimental/operators/CpuGEMMLowp.cpp b/tests/validation/runtime/experimental/operators/CpuGEMMLowp.cpp
new file mode 100644
index 0000000000..b8e6830700
--- /dev/null
+++ b/tests/validation/runtime/experimental/operators/CpuGEMMLowp.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2017-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/experimental/operators/CpuGEMMLowp.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/LargeGEMMLowpDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/SmallGEMMLowpDataset.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/fixtures/CpuGEMMLowpFixture.h"
+
+/*
+
+ * Tests for arm_compute::experimental::op::CpuGEMMLowp, which is a shallow wrapper for
+ * arm_compute::cpu::CpuGemmLowpMatrixMultiplyCore. Any future tests of arm_compute::cpu::CpuGemmLowpMatrixMultiplyCore functionality will
+ * be added in tests/validation/NEON/GEMMLowp.cpp, given that op::CpuGEMMLowp remains a shallow wrapper.
+*/
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+using framework::dataset::make;
+
+TEST_SUITE(NEON)
+TEST_SUITE(OPERATORS)
+TEST_SUITE(CpuGEMMLowp)
+
+using CpuGEMMLowpFixture = CpuGEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, arm_compute::experimental::op::CpuGEMMLowp>;
+
+using framework::dataset::make;
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()),
+               shape_a, shape_b, shape_c, a_offset, b_offset)
+{
+    // Create tensors
+    Tensor a = create_tensor<Tensor>(shape_a, DataType::QASYMM8);
+    Tensor b = create_tensor<Tensor>(shape_b, DataType::QASYMM8);
+    Tensor c = create_tensor<Tensor>(shape_c, DataType::S32);
+
+    a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset));
+    b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset));
+
+    ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    arm_compute::experimental::op::CpuGEMMLowp gemmlowp_mm;
+    gemmlowp_mm.configure(a.info(), b.info(), nullptr, c.info());
+
+    // Validate padding is zero
+    validate(a.info()->padding(), PaddingSize());
+    validate(b.info()->padding(), PaddingSize());
+    validate(c.info()->padding(), PaddingSize());
+}
+// accumulation is not supported for Int8/UInt8 in aarch32
+#ifdef __aarch64__
+DATA_TEST_CASE(ValidateAccumulate, framework::DatasetMode::ALL, combine(
+                                                                    zip(
+                                                                     make("In0",{ TensorShape(21U, 1U) }),
+                                                                     make("In1", { TensorShape(1U, 21U) }),
+                                                                     make("Dst", { TensorShape(1U, 1U) }),
+                                                                     make("a_offset", { -2 }),
+                                                                     make("b_offset", { 13 })
+                                                                    ),
+                                                                    zip(
+                                                                     make("OutputDataType", {  DataType::S32,  DataType::QASYMM8, DataType::QASYMM8_SIGNED}),
+                                                                     make("Expected", { true, false, false })
+                                                                    )),
+               shape_a, shape_b, shape_dst, a_offset, b_offset, output_data_type, expected)
+{
+    DataType input_data_type = (output_data_type == DataType::S32 ? DataType::QASYMM8 : output_data_type);
+    // Accumulation test for GEMM kernels
+    TensorInfo a(shape_a, 1, input_data_type, QuantizationInfo(1.0f / 255, a_offset));
+    TensorInfo b(shape_b, 1, input_data_type, QuantizationInfo(1.0f / 255, b_offset));
+    TensorInfo dst(shape_dst, 1, output_data_type, QuantizationInfo());
+
+    // Create and configure function
+    GEMMInfo gemm_info = GEMMInfo();
+    gemm_info.set_accumulate(true);
+
+    if (is_data_type_quantized(output_data_type))
+    {
+        GEMMLowpOutputStageInfo gemmLowpOutputStageInfo = GEMMLowpOutputStageInfo();
+        gemmLowpOutputStageInfo.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+
+        gemm_info.set_gemmlowp_output_stage(gemmLowpOutputStageInfo);
+    }
+
+    arm_compute::experimental::op::CpuGEMMLowp gemmlowp_mm;
+    Status status = gemmlowp_mm.validate(&a, &b, nullptr, &dst, gemm_info);
+
+    ARM_COMPUTE_EXPECT((expected ==  bool(status)), framework::LogLevel::ERRORS);
+}
+#endif // __aarch64__
+
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+    make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Valid: first dimension (21) is not a multiple of 4
+                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::S32),                                 // Invalid: mismatching data type (A is S32)
+                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions (A's dim 0 is 20, B's dim 1 is 21)
+                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions (output is 8x11)
+                                             TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Valid
+                                             TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.f/255, 10)), // Invalid types (A is QASYMM8_SIGNED, B is QASYMM8)
+                                          }),
+    make("InputBInfo",{ TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                            TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                            TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                            TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                            TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                            TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                          }),
+    make("OutputInfo",{ TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                            TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                            TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                            TensorInfo(TensorShape(8U, 11U), 1, DataType::S32),
+                                            TensorInfo(TensorShape(64U, 32U), 1, DataType::S32),
+                                            TensorInfo(TensorShape(64U, 32U), 1, DataType::S32),
+                                           }),
+    make("Expected", { true, false, false, false, true, false })), // expected validate() outcome for each zipped (A, B, output) triple
+    a_info, b_info, output_info, expected)
+{
+    // Lock tensors: validate() is called on clones of the infos that are marked as non-resizable
+    Status status = arm_compute::experimental::op::CpuGEMMLowp::validate(&a_info.clone()->set_is_resizable(false),
+                                                            &b_info.clone()->set_is_resizable(false),
+                                                            nullptr,
+                                                            &output_info.clone()->set_is_resizable(false));
+    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+
+/** Test case for memory injection in @ref arm_compute::experimental::op::CpuGEMMLowp.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output (every S32 element of the destination is compared)
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+    auto gemm     = std::make_unique<arm_compute::experimental::op::CpuGEMMLowp>();
+    auto a_info   = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
+    auto b_info   = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
+    auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
+    a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
+    b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
+    const auto gemm_info = GEMMInfo{};
+    gemm->configure(&a_info, &b_info, nullptr, &dst_info, gemm_info);
+
+    // a and b are shared by all runs; run_gemm below registers a freshly created ACL_DST tensor on every call
+    auto a   = create_tensor<Tensor>(a_info);
+    auto b   = create_tensor<Tensor>(b_info);
+    auto dst = create_tensor<Tensor>(dst_info);
+    a.allocator()->allocate();
+    b.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    ITensorPack run_pack =
+    {
+        { TensorType::ACL_SRC_0, &a },
+        { TensorType::ACL_SRC_1, &b },
+        { TensorType::ACL_DST, &dst }
+    };
+    ITensorPack prep_pack =
+    {
+        { TensorType::ACL_SRC_1, &b },
+    };
+
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);
+    allocate_tensors(gemm->workspace(), ws);
+
+    auto run_gemm = [&]() -> Tensor
+    {
+        auto dst = create_tensor<Tensor>(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
+        library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
+        // This operator is configured once and captured by this lambda.
+        gemm->prepare(prep_pack);
+        gemm->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_gemm();
+    auto result_1 = run_gemm();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) // dst is S32: compare whole elements, not just the leading bytes
+    {
+        ARM_COMPUTE_EXPECT(reinterpret_cast<const int32_t *>(result_0.buffer())[i] == reinterpret_cast<const int32_t *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(SmokeTest, CpuGEMMLowpFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
+{
+    // Validate output: compare the fixture's _target against its _reference (no tolerance argument is passed)
+    validate(Accessor(_target), _reference);
+}
+
+
+TEST_SUITE_END() // CpuGEMMLowp
+TEST_SUITE_END() // OPERATORS
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp b/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp
index 9d87a3d2e5..14c8affe27 100644
--- a/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp
+++ b/tests/validation/runtime/experimental/operators/CpuGemmConv2d.cpp
@@ -48,9 +48,18 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
+
 namespace
 {
 const RelativeTolerance<float> rel_tolerance_f32(0.01f);
+#ifdef ARM_COMPUTE_ENABLE_SME
+// TODO(COMPMID-6011): SME kernels and the reference model use different rounding modes.
+// Temporarily increase the tolerance for quantized data.
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(1.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types (relaxed to 1 when SME is enabled) */
+#else  // ARM_COMPUTE_ENABLE_SME
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types (zero tolerance without SME) */
+#endif // ARM_COMPUTE_ENABLE_SME
 } // namespace
 
 TEST_SUITE(NEON)
@@ -117,6 +126,8 @@ TEST_CASE(OpCpuGemmConv2dMemoryInjection, framework::DatasetMode::ALL)
 }
 
 using CpuGemmConv2dFixture = CpuGemmConv2dValidationFixture<Tensor, Accessor, experimental::op::CpuGemmConv2d>;
+template <typename T> // T is forwarded as the fixture's last template argument; instantiated below with int8_t (QASYMM8_SIGNED) and uint8_t (QASYMM8)
+using CpuGemmConv2dStaticQuantFixture = CpuGemmConv2dForUpdatedStaticQuantInfoAfterConfigureFixture<Tensor, Accessor, experimental::op::CpuGemmConv2d, T>;
 
 TEST_SUITE(F32)
 FIXTURE_DATA_TEST_CASE(SmokeTest,
@@ -129,6 +140,48 @@ FIXTURE_DATA_TEST_CASE(SmokeTest,
 }
 TEST_SUITE_END() // F32
 
+#ifdef __aarch64__
+
+const auto QuantizedActivationFunctionsDataset = make("ActivationInfo", // activation variants combined into the quantized smoke tests below
+{
+    ActivationLayerInfo(), // default-constructed; presumably means "no activation" - TODO confirm
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f) // 6.f is presumably the upper bound - TODO confirm
+});
+
+TEST_SUITE(Quantized)
+
+TEST_SUITE(UpdateStaticQuantInfoAfterConfigure)
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(SmokeTest, CpuGemmConv2dStaticQuantFixture<int8_t>, framework::DatasetMode::ALL, combine(datasets::TinyConvolutionLayerDataset(),
+                                                                                                                      make("ReshapeWeights", { true }),
+                                                                                                                      make("DataType", DataType::QASYMM8_SIGNED),
+                                                                                                                      make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+                                                                                                                      make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(0.01f, -10) }),
+                                                                                                                      QuantizedActivationFunctionsDataset))
+{
+    // Validate output: compare _target against _reference within tolerance_qasymm8 (1 when SME is enabled, 0 otherwise)
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(SmokeTest, CpuGemmConv2dStaticQuantFixture<uint8_t>, framework::DatasetMode::ALL, combine(datasets::TinyConvolutionLayerDataset(),
+                                                                                                                       make("ReshapeWeights", { true }),
+                                                                                                                       make("DataType", DataType::QASYMM8),
+                                                                                                                       make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }),
+                                                                                                                       make("QuantizationInfoIfActivationEnabled", { QuantizationInfo(2.f / 255.f, 10) }),
+                                                                                                                       QuantizedActivationFunctionsDataset))
+{
+    // Validate output: compare _target against _reference within tolerance_qasymm8 (1 when SME is enabled, 0 otherwise)
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // UpdateStaticQuantInfoAfterConfigure
+
+TEST_SUITE_END() // Quantized
+#endif // __aarch64__
+
 TEST_SUITE_END() // CpuGemmConv2d
 TEST_SUITE_END() // OPERATORS
 TEST_SUITE_END() // NEON
diff --git a/tests/validation/runtime/experimental/operators/CpuQuantize.cpp b/tests/validation/runtime/experimental/operators/CpuQuantize.cpp
new file mode 100644
index 0000000000..9fd4bb9378
--- /dev/null
+++ b/tests/validation/runtime/experimental/operators/CpuQuantize.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/experimental/operators/CpuQuantize.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/CpuQuantizeFixture.h"
+
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+
+/*
+ * Tests for arm_compute::experimental::op::CpuQuantize, which is a shallow wrapper for
+ * arm_compute::cpu::CpuQuantize. Any future tests of the functionality of cpu::CpuQuantize
+ * belong in tests/NEON/QuantizationLayer.cpp, as long as op::CpuQuantize remains a
+ * shallow wrapper.
+*/
+using arm_compute::experimental::op::CpuQuantize;
+using arm_compute::test::validation::CpuQuantizationValidationFixture;
+namespace
+{
+/** Tolerance and shape dataset used by the CpuQuantize tests in this file */
+constexpr AbsoluteTolerance<uint8_t>  tolerance_u8(1);  /**< Tolerance value for comparing reference's output against implementation's output for QASYMM8 data types */
+const auto                            QuantizationSmallShapes = concat(datasets::Small3DShapes(), datasets::Small4DShapes()); /**< Concatenation of the small 3D and small 4D shape datasets */
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(OPERATORS)
+TEST_SUITE(CpuQuantize)
+
+using framework::dataset::make ;
+
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+               make("InputInfo", { TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8),  // Wrong output data type (output is F32)
+                                                       TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32),  // Wrong output data type (output is U16)
+                                                       TensorInfo(TensorShape(16U, 16U, 2U, 5U), 1, DataType::F32),  // Mismatching shapes (input 16x16x2x5, output 16x16x16x5)
+                                                       TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32),  // Valid (F32 input, QASYMM8 output)
+                                                     }),
+               make("OutputInfo",{ TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::U16),
+                                                       TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8),
+                                                       TensorInfo(TensorShape(16U, 16U, 16U, 5U), 1, DataType::QASYMM8),
+                                                     })),
+               make("Expected", { false, false, false, true})), // expected validate() outcome for each zipped (input, output) pair
+               input_info, output_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CpuQuantize::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+
+template <typename T> // T is the fixture's first element-type argument (float in the smoke test below); the second is fixed to uint8_t
+using CpuQuantizeQASYMM8Fixture = CpuQuantizationValidationFixture<Tensor, Accessor, CpuQuantize, T, uint8_t>;
+
+FIXTURE_DATA_TEST_CASE(SmokeTest, CpuQuantizeQASYMM8Fixture<float>, framework::DatasetMode::ALL, combine(QuantizationSmallShapes,
+                       make("DataType", DataType::F32),
+                       make("DataTypeOut", { DataType::QASYMM8 }),
+                      make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })
+                      ))
+{
+    // Validate output: compare _target against _reference, allowing an absolute difference of 1 (tolerance_u8)
+    validate(Accessor(_target), _reference, tolerance_u8);
+}
+TEST_SUITE_END() // CpuQuantize
+TEST_SUITE_END() // OPERATORS
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 2d106d849a..08d00fc96a 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -3690,6 +3690,17 @@ inline std::string to_string(const arm_compute::ScatterInfo &info)
     return str.str();
 }
 
+/** Textual form of a boolean value.
+ *
+ * @param[in] info Boolean value to format.
+ * @return "true" if @p info is set, otherwise "false".
+ */
+inline std::string to_string(const bool &info)
+{
+    const char *const text = info ? "true" : "false";
+    return std::string(text);
+}
+
 } // namespace arm_compute
 
 #endif // ACL_UTILS_TYPEPRINTER_H

src	dst -
U8	S8, U16, S16, U32, S32, F16, F32 -
S8	U8, U16, S16, U32, S32, F16, F32 -
U16	U8, S8, S16, U32, S32, F16, F32 -
S16	U8, S8, U16, U32, S32, F16, F32 -
U32	U8, S8, U16, S16, S32, F16, F32 -
S32	U8, S8, U16, S16, U32, F16, F32 -
U64	U8, S8, U16, S16, U32, S32, F16, F32 -
S64	U8, S8, U16, S16, U32, S32, F16, F32 -
F16	U8, S8, U16, S16, S32, U32, F32 -
F32	U8, S8, U16, S16, S32, U32, F16 +
U8	S8, U16, S16, U32, S32, F16, F32, QASYMM8_SIGNED, QSYMM8, QSYMM8_PER_CHANNEL, 16-bit Quantized +
S8	U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized +
U16	U8, S8, S16, U32, S32, F16, F32, 8-bit Quantized, QSYMM16 +
S16	U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QASYMM16 +
U32	U8, S8, U16, S16, S32, F16, F32, All Quantized +
S32	U8, S8, U16, S16, U32, F16, F32, All Quantized +
U64	U8, S8, U16, S16, U32, S32, F16, F32, All Quantized +
S64	U8, S8, U16, S16, U32, S32, F16, F32, All Quantized +
F16	U8, S8, U16, S16, S32, U32, F32, All Quantized +
F32	U8, S8, U16, S16, S32, U32, F16, All Quantized +
QASYMM8	S8, U16, S16, U32, S32, F16, F32, QASYMM8_SIGNED, QSYMM8, QSYMM8_PER_CHANNEL, 16-bit Quantized +
QASYMM8_SIGNED	U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized +
QSYMM8	U8, U16, S16, U32, S32, F16, F32, QASYMM8, 16-bit Quantized +
QSYMM8_PER_CHANNEL	U8, U16, S16, U32, S32, F16, F32, 16-bit Quantized +
QASYMM16	U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QSYMM16 +
QSYMM16	U8, S8, U16, U32, S32, F16, F32, 8-bit Quantized, QASYMM16