diff --git a/.ci/daint.cscs.ch/Jenkinsfile b/.ci/daint.cscs.ch/Jenkinsfile
index 4ce6d62ed1b..73a0e486146 100644
--- a/.ci/daint.cscs.ch/Jenkinsfile
+++ b/.ci/daint.cscs.ch/Jenkinsfile
@@ -59,6 +59,20 @@ pipeline {
                         }
                     }
                 }
+                stage("OpenCL") {
+                    stages {
+                        stage('build') {
+                            steps {
+                                run_batch("0:15:00", "ocl", "build")
+                            }
+                        }
+//                        stage('test') {
+//                            steps {
+//                                run_batch("1:00:00", "ocl", "test")
+//                            }
+//                        }
+                    }
+                }
                 stage("Intel") {
                     stages {
                         stage('build') {
diff --git a/.ci/daint.cscs.ch/cray.build.sh b/.ci/daint.cscs.ch/cray.build.sh
index 3f2deff403a..ed64c1ba521 100755
--- a/.ci/daint.cscs.ch/cray.build.sh
+++ b/.ci/daint.cscs.ch/cray.build.sh
@@ -30,7 +30,7 @@ cd "${SCRATCH}/${BUILD_TAG}.cray"
 
 cmake \
     -DCMAKE_SYSTEM_NAME=CrayLinuxEnvironment \
-    -DUSE_CUDA=ON \
+    -DUSE_ACCEL=cuda \
     -DWITH_GPU=P100 \
     -DBLAS_FOUND=ON -DBLAS_LIBRARIES="-lsci_cray_mpi_mp" \
     -DLAPACK_FOUND=ON -DLAPACK_LIBRARIES="-lsci_cray_mpi_mp" \
diff --git a/.ci/daint.cscs.ch/gnu.build.sh b/.ci/daint.cscs.ch/gnu.build.sh
index d104d765038..84157a2a743 100755
--- a/.ci/daint.cscs.ch/gnu.build.sh
+++ b/.ci/daint.cscs.ch/gnu.build.sh
@@ -28,7 +28,7 @@ cd "${SCRATCH}/${BUILD_TAG}.gnu"
 cmake \
     -DCMAKE_SYSTEM_NAME=CrayLinuxEnvironment \
     -DCMAKE_CROSSCOMPILING_EMULATOR="" \
-    -DUSE_CUDA=ON \
+    -DUSE_ACCEL=cuda \
     -DWITH_GPU=P100 \
     -DBLAS_FOUND=ON -DBLAS_LIBRARIES="-lsci_gnu_mpi_mp" \
     -DLAPACK_FOUND=ON -DLAPACK_LIBRARIES="-lsci_gnu_mpi_mp" \
diff --git a/.ci/daint.cscs.ch/intel.build.sh b/.ci/daint.cscs.ch/intel.build.sh
index c91ae23f77f..1b87da2f61b 100755
--- a/.ci/daint.cscs.ch/intel.build.sh
+++ b/.ci/daint.cscs.ch/intel.build.sh
@@ -31,7 +31,7 @@ cd "${SCRATCH}/${BUILD_TAG}.intel"
 
 cmake \
     -DCMAKE_SYSTEM_NAME=CrayLinuxEnvironment \
-    -DUSE_CUDA=ON \
+    -DUSE_ACCEL=cuda \
     -DWITH_GPU=P100 \
     -DBLAS_FOUND=ON -DBLAS_LIBRARIES="-lsci_intel_mpi_mp" \
     -DLAPACK_FOUND=ON -DLAPACK_LIBRARIES="-lsci_intel_mpi_mp" \
diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh
new file mode 100755
index 00000000000..5d0676e0788
--- /dev/null
+++ b/.ci/daint.cscs.ch/ocl.build.sh
@@ -0,0 +1,55 @@
+#!/bin/bash -l
+
+#SBATCH --export=ALL
+#SBATCH --exclusive
+#SBATCH --constraint="mc"
+#SBATCH --partition="cscsci"
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=3
+#SBATCH --ntasks-per-core=1 # 1=no HT, 2=HT
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+module swap PrgEnv-cray PrgEnv-gnu
+module load daint-gpu cudatoolkit CMake/3.14.5
+module unload cray-libsci_acc
+module list
+
+# Checkout and build LIBXSMM
+if [ ! -d "${HOME}/libxsmm" ]; then
+  cd "${HOME}"
+  git clone https://github.com/hfp/libxsmm.git
+fi
+cd "${HOME}/libxsmm"
+git checkout 02d6ab213a35d5fc2f6454c3b465598b0c086c17
+make -j
+cd ..
+
+set -o xtrace  # do not set earlier to avoid noise from module
+
+umask 0002  # make sure group members can access the data
+
+mkdir -p "${SCRATCH}/${BUILD_TAG}.ocl"
+chmod 0775 "${SCRATCH}/${BUILD_TAG}.ocl"
+cd "${SCRATCH}/${BUILD_TAG}.ocl"
+
+# help CMake to find the OpenCL implementation
+export NVSDKCOMPUTE_ROOT=${CUDATOOLKIT_HOME}
+export PKG_CONFIG_PATH=${HOME}/libxsmm/lib:${PKG_CONFIG_PATH}
+
+cmake \
+    -DCMAKE_SYSTEM_NAME=CrayLinuxEnvironment \
+    -DCMAKE_CROSSCOMPILING_EMULATOR="" \
+    -DUSE_ACCEL=opencl -DUSE_SMM=libxsmm \
+    -DOpenCL_LIBRARY="${CUDATOOLKIT_HOME}/lib64/libOpenCL.so" \
+    -DBLAS_FOUND=ON -DBLAS_LIBRARIES="-lsci_gnu_mpi_mp" \
+    -DLAPACK_FOUND=ON -DLAPACK_LIBRARIES="-lsci_gnu_mpi_mp" \
+    -DMPIEXEC_EXECUTABLE="$(command -v srun)" \
+    -DTEST_MPI_RANKS="${SLURM_NTASKS}" \
+    -DTEST_OMP_THREADS="${SLURM_CPUS_PER_TASK}" \
+    "${WORKSPACE}" |& tee -a "${STAGE_NAME}.out"
+
+make VERBOSE=1 -j |& tee -a "${STAGE_NAME}.out"
diff --git a/.ci/daint.cscs.ch/ocl.test.sh b/.ci/daint.cscs.ch/ocl.test.sh
new file mode 100755
index 00000000000..19df5c35f22
--- /dev/null
+++ b/.ci/daint.cscs.ch/ocl.test.sh
@@ -0,0 +1,36 @@
+#!/bin/bash -l
+
+#SBATCH --export=ALL
+#SBATCH --exclusive
+#SBATCH --constraint="gpu"
+#SBATCH --partition="cscsci"
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=3
+#SBATCH --ntasks-per-core=1 # 1=no HT, 2=HT
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+module swap PrgEnv-cray PrgEnv-gnu
+module load daint-gpu cudatoolkit CMake/3.14.5
+module unload cray-libsci_acc
+module list
+
+set -o xtrace  # do not set earlier to avoid noise from module
+
+umask 0002  # make sure group members can access the data
+
+mkdir -p "${SCRATCH}/${BUILD_TAG}.ocl"
+chmod 0775 "${SCRATCH}/${BUILD_TAG}.ocl"
+cd "${SCRATCH}/${BUILD_TAG}.ocl"
+
+export CRAY_CUDA_MPS=1 # enable the CUDA proxy for MPI+CUDA
+export OMP_PROC_BIND=TRUE # set thread affinity
+# OMP_NUM_THREADS is set by cmake
+
+# document the current environment
+env |& tee -a "${STAGE_NAME}.out"
+
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="--timeout 900" |& tee -a "${STAGE_NAME}.out"
diff --git a/.github/workflows/testing-linux.yml b/.github/workflows/testing-linux.yml
index 7c8fd2d4aad..0572a377c87 100644
--- a/.github/workflows/testing-linux.yml
+++ b/.github/workflows/testing-linux.yml
@@ -101,7 +101,7 @@ jobs:
         cmake -G Ninja \
           -DCMAKE_BUILD_TYPE=Release \
           -DUSE_${{ matrix.use_openmp }} \
-          -DUSE_HIP=ON \
+          -DUSE_ACCEL=hip \
           -DWITH_GPU=Mi50 \
           ..
     - name: Build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7f12b48606..544399b33fb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -90,15 +90,10 @@ set(USE_SMM
           "Small Matrix Multiplication implementation to use (default: blas)")
 set_property(CACHE USE_SMM PROPERTY STRINGS blas libxsmm)
 
-option(USE_CUDA "Build with CUDA support" OFF)
-option(USE_HIP "Build with HIP support" OFF)
-# USE_CUDA and USE_HIP are mutually exclusive options: we either compile with
-# nvcc OR with hipcc
-if (USE_CUDA AND USE_HIP)
-  message(
-    FATAL_ERROR
-      "USE_CUDA and USE_HIP options are mutually exclusive. Please choose one.")
-endif ()
+set(USE_ACCEL
+    ""
+    CACHE STRING "Build with acceleration support (default: none)")
+set_property(CACHE USE_ACCEL PROPERTY STRINGS "" opencl cuda hip)
 
 set(SUPPORTED_CUDA_ARCHITECTURES K20X K40 K80 P100 V100)
 set(SUPPORTED_HIP_ARCHITECTURES Mi50)
@@ -117,21 +112,27 @@ enable_language(Fortran)
 
 if (WITH_C_API AND WITH_EXAMPLES)
   enable_language(CXX)
+  enable_language(C)
 endif ()
 
-# we're always using at least C++11
+# always use at least C++11
 set(CMAKE_CXX_STANDARD 11)
 
 # =================================================================================================
 # PACKAGE DISCOVERY (compiler configuration can impact package discovery)
+find_package(PkgConfig)
 
-# =================================== OpenMP and OpenMP/offload backend
+# =================================== OpenMP
 if (USE_OPENMP)
   find_package(OpenMP REQUIRED)
 endif ()
 
+# =================================== LIBXSMM (rely on pkg-config)
+if ((USE_SMM MATCHES "libxsmm") OR (USE_ACCEL MATCHES "opencl"))
+  pkg_check_modules(LIBXSMM IMPORTED_TARGET GLOBAL libxsmmf)
+endif ()
+
 # =================================== BLAS & LAPACK, PkgConfig
-find_package(PkgConfig)
 find_package(LAPACK REQUIRED) # needed for some of the integrated test routines,
                               # also calls find_package(BLAS)
 
@@ -141,8 +142,7 @@ find_package(LAPACK REQUIRED) # needed for some of the integrated test routines,
 # environment for a python interpreter before searching elsewhere in the system.
 # In CMake <3.15, the system is searched before the virtual environment.
 if (NOT Python_EXECUTABLE)
-  # If the python interpreter isn't specified as a command line option, look for
-  # it:
+  # If the python interpreter is not specified (command line), try finding it:
   find_package(
     Python
     COMPONENTS Interpreter
@@ -185,15 +185,35 @@ endif ()
 if (USE_SMM MATCHES "blas")
   message("-- Using BLAS for Small Matrix Multiplication")
 elseif (USE_SMM MATCHES "libxsmm")
-  # rely on pkg-config in order to link against libxsmm
-  pkg_check_modules(deps REQUIRED IMPORTED_TARGET GLOBAL libxsmmf)
-  message("-- Using libxsmm for Small Matrix Multiplication")
+  if (LIBXSMM_FOUND)
+    message("-- Using LIBXSMM for Small Matrix Multiplication")
+  else ()
+    message(
+      FATAL_ERROR
+        "LIBXSMM is not found but requested (USE_SMM). "
+        "Please install PkgConfig, build LIBXSMM, and "
+        "set PKG_CONFIG_PATH=/path/to/libxsmm/lib")
+  endif ()
 else ()
   message(FATAL_ERROR "Unknown SMM library specified")
 endif ()
 
-# =================================== GPU backend
-if (USE_CUDA OR USE_HIP)
+# =================================== GPU backends
+if (USE_ACCEL MATCHES "opencl")
+  if (NOT LIBXSMM_FOUND)
+    message(
+      FATAL_ERROR
+        "LIBXSMM is not found but required for "
+        "LIBSMM based on the ACC/OpenCL backend. "
+        "Please install PkgConfig, LIBXSMM, and "
+        "set PKG_CONFIG_PATH=/path/to/libxsmm/lib")
+  endif ()
+
+  find_package(OpenCL REQUIRED)
+  enable_language(C)
+endif ()
+
+if (USE_ACCEL MATCHES "cuda|hip")
   enable_language(CXX)
   set(GPU_ARCH_NUMBER_K20X 35)
   set(GPU_ARCH_NUMBER_K40 35)
@@ -203,8 +223,7 @@ if (USE_CUDA OR USE_HIP)
   set(GPU_ARCH_NUMBER_Mi50 gfx906)
 endif ()
 
-if (USE_CUDA)
-
+if (USE_ACCEL MATCHES "cuda")
   enable_language(CUDA)
   if (CMAKE_CUDA_COMPILER_VERSION LESS 5.5)
     message(FATAL_ERROR "CUDA version >= 5.5 is required.")
@@ -214,9 +233,8 @@ if (USE_CUDA)
   list(FIND SUPPORTED_CUDA_ARCHITECTURES ${WITH_GPU} GPU_SUPPORTED)
   if (GPU_SUPPORTED EQUAL -1)
     message(
-      FATAL_ERROR
-        "GPU architecture requested (${WITH_GPU}) is not supported. Please choose from: ${SUPPORTED_CUDA_ARCHITECTURES}"
-    )
+      FATAL_ERROR "GPU architecture requested (${WITH_GPU}) is not supported. "
+                  "Please choose from: ${SUPPORTED_CUDA_ARCHITECTURES}")
   endif ()
 
   # assume that the backend compiler for nvcc understands the -std=c++11
@@ -243,7 +261,6 @@ if (USE_CUDA)
   else ()
     message(STATUS "Found cuBLAS: ${CUBLAS}")
   endif ()
-
   if (WITH_CUDA_PROFILING)
     find_library(
       CUDA_NVTOOLSEXT nvToolsExt
@@ -257,15 +274,13 @@ endif ()
 
 # inspired from
 # https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/2_Cookbook/12_cmake_hip_add_executable
-if (USE_HIP)
-
+if (USE_ACCEL MATCHES "hip")
   # Make sure the GPU required is supported
   list(FIND SUPPORTED_HIP_ARCHITECTURES ${WITH_GPU} GPU_SUPPORTED)
   if (GPU_SUPPORTED EQUAL -1)
     message(
-      FATAL_ERROR
-        "GPU architecture requested (${WITH_GPU}) is not supported. Please choose from: ${SUPPORTED_HIP_ARCHITECTURES}"
-    )
+      FATAL_ERROR "GPU architecture requested (${WITH_GPU}) is not supported. "
+                  "Please choose from: ${SUPPORTED_HIP_ARCHITECTURES}")
   endif ()
 
   # Set path to HIP installation, include HIP cmake utilities
diff --git a/cmake/CompilerConfiguration.cmake b/cmake/CompilerConfiguration.cmake
index c5873f431d9..af29b4dd0a2 100644
--- a/cmake/CompilerConfiguration.cmake
+++ b/cmake/CompilerConfiguration.cmake
@@ -88,3 +88,8 @@ Please open an issue at https://github.com/cp2k/dbcsr/issues with the reported c
   message("-- CMAKE_CXX_COMPILER_ID: " ${CMAKE_CXX_COMPILER_ID})
   message("-- CMAKE_CXX_COMPILER full path: " ${CMAKE_CXX_COMPILER})
 endif ()
+
+# inherit C flags from CXX
+set(CMAKE_C_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
+set(CMAKE_C_FLAGS_COVERAGE ${CMAKE_CXX_FLAGS_COVERAGE})
+set(CMAKE_C_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
diff --git a/docs/guide/2-user-guide/1-installation/index.md b/docs/guide/2-user-guide/1-installation/index.md
index c0efd0d406b..88f01310b2b 100644
--- a/docs/guide/2-user-guide/1-installation/index.md
+++ b/docs/guide/2-user-guide/1-installation/index.md
@@ -4,27 +4,33 @@ title: Install
 
 ## Prerequisites
 
-You absolutely need:
+You need:
 
 * [CMake](https://cmake.org/) (3.12+)
 * GNU make or Ninja
-* a Fortran compiler which supports at least Fortran 2008 (including the TS 29113 when using the C-bindings)
-* a BLAS+LAPACK implementation (reference, OpenBLAS and MKL have been tested. Note: DBCSR linked to OpenBLAS 0.3.6 gives wrong results on Power9 architectures.)
-* a Python version installed (2.7 or 3.6+ have been tested)
+* Fortran compiler which supports at least Fortran 2008 (including the TS 29113 when using the C-bindings)
+* BLAS+LAPACK implementation (reference, OpenBLAS and MKL have been tested. Note: DBCSR linked to OpenBLAS 0.3.6 gives wrong results on Power9 architectures.)
+* Python version installed (2.7 or 3.6+ have been tested)
 
-Optionally:
+Optional:
 
-* [libxsmm](https://github.com/hfp/libxsmm) (1.10+, and `pkg-config`) for Small Matrix Multiplication acceleration
-* a LAPACK implementation (reference, OpenBLAS-bundled and MKL have been tested), required when building the tests
+* [LIBXSMM](https://github.com/hfp/libxsmm) (1.10+, and `pkg-config`) for Small Matrix Multiplication acceleration
+* LAPACK implementation (reference, OpenBLAS-bundled and MKL have been tested), required when building the tests
 
-To build `libsmm_acc`, DBCSR's GPU backend, you further need:
+To build DBCSR's GPU backend:
 
-* A GPU-capable compiler, either
-  * CUDA Toolkit (targets NVIDIA GPUs, minimal version required: 5.5) with cuBLAS
-  * or HIP compiler (targets NVIDIA or AMD GPUs) and hipBLAS (the tested version is ROCm 3.8)
-* a C++ compiler which supports at least C++11 standard
+* CUDA Toolkit (targets NVIDIA GPUs, minimal version required: 5.5) with cuBLAS
+    * Host C++ compiler which supports at least C++11 standard
+* or HIP compiler (targets NVIDIA or AMD GPUs) and hipBLAS (ROCm 3.8 was tested)
+    * Host C++ compiler which supports at least C++11 standard
+* or OpenCL, i.e., development headers (`opencl-headers`), generic loader "ocl-icd" (`ocl-icd-opencl-dev`),
+    * Vendor specific OpenCL package, e.g., [Intel Compute Runtime](https://github.com/intel/compute-runtime/releases/latest),
+      or CUDA Toolkit (includes OpenCL)
+    * For the OpenCL backend, a plain C compiler is sufficient (C90 standard),
+    * Optionally `clinfo` (can be useful to show available devices)
 
-We test against GNU and Intel compilers on Linux systems, GNU compiler on MacOS systems. See a list of supported compilers [here](./3-supported-compilers.html).
+DBCSR is tested against GNU and Intel compilers on Linux systems, and GNU compiler on MacOS systems.
+See a list of supported compilers [here](./3-supported-compilers.html).
 
 ## Get DBCSR
 
@@ -36,12 +42,12 @@ git clone --recursive https://github.com/cp2k/dbcsr.git
 
 ## Build
 
-DBCSR can be compiled in 4 main variants:
-* Serial, i.e. no OpenMP and MPI
+DBCSR can be compiled in four main variants:
+* Serial, i.e., no OpenMP and no MPI
 * OpenMP
 * MPI
 * OpenMP+MPI
-The 4 variants can be combined with the accelerator support.
+In addition, the variants can support accelerators.
 
 Run inside the `dbcsr` directory:
 
@@ -58,9 +64,8 @@ make
 -DUSE_MPI=<ON|OFF>
 -DUSE_OPENMP=<ON|OFF>
 -DUSE_SMM=<blas|libxsmm>
--DUSE_CUDA=<OFF|ON>
+-DUSE_ACCEL=<opencl|cuda|hip>
 -DWITH_CUDA_PROFILING=<OFF|ON>
--DUSE_HIP=<OFF|ON>
 -DWITH_C_API=<ON|OFF>
 -DWITH_EXAMPLES=<ON|OFF>
 -DWITH_GPU=<P100|K20X|K40|K80|V100|Mi50>
@@ -70,8 +75,8 @@ make
 -DTEST_OMP_THREADS=<2,N>
 ```
 
-When providing a custom build of `libxsmm`, make sure that its library directory is added to the `PKG_CONFIG_PATH` variable prior
-to running `cmake`. An example if `libxsmm` was checked out using Git to your home folder:
+When providing a build of LIBXSMM, make sure the `lib` directory is added to the `PKG_CONFIG_PATH` variable prior
+to running `cmake`. For example, if LIBXSMM was checked out using Git to your home folder:
 
 ```bash
 export PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:${HOME}/libxsmm/lib"
@@ -83,11 +88,7 @@ For build recipes on different platforms, make sure to also read the [CMake Buil
 
 ### Using Python in a virtual environment
 
-If you want to use Python from a virtual environment and your CMake version is < 3.15, specify the desired python interpreter manually using:
-
-```
-    -DPython_EXECUTABLE=/path/to/python
-```
+If Python is desired from a virtual environment and the CMake version below v3.15, then the python interpreter shall be specified manually using `cmake -DPython_EXECUTABLE=/path/to/python`.
 
 ### C/C++ Interface
 
@@ -98,7 +99,7 @@ If MPI support is enabled (the default), the C API is automatically built.
 HIP is a relatively new language, and some issues still need to be ironed out. As a workaround to an [issue](https://github.com/ROCm-Developer-Tools/HIP/pull/1543) in HIP's JIT infrastructure, please set the following if you've built HIP from source:
 
 ```bash
-    export HIP_PATH=/opt/rocm/hip
+export HIP_PATH=/opt/rocm/hip
 ```
 
 before running on an AMD GPU.
diff --git a/docs/guide/3-developer-guide/3-programming/1-overview/index.md b/docs/guide/3-developer-guide/3-programming/1-overview/index.md
index 171905f53e1..0577d60a4e8 100644
--- a/docs/guide/3-developer-guide/3-programming/1-overview/index.md
+++ b/docs/guide/3-developer-guide/3-programming/1-overview/index.md
@@ -51,10 +51,7 @@ Assumed square matrix with 20x20 matrix with 5x5 blocks and a 2x2 processor grid
 | `NDEBUG`       | Assertions are stripped ("compiled out"), `NDEBUG` is the ANSI-conforming symbol name (not `__NDEBUG`). Regular release builds may carry assertions for safety | Fortran, C, C++ |
 | `__CRAY_PM_ACCEL_ENERGY` or `__CRAY_PM_ENERGY` | Switch on collectin energy profiling on Cray systems | Fortran |
 | `__DBCSR_ACC` | Enable Accelerator compilation | Fortran, C, C++ |
+| `__OPENCL`  | Enable OpenCL acceleration | C |
 | `__CUDA_PROFILING`  | To turn on Nvidia Tools Extensions. It requires to link `-lnvToolsExt` | Fortran, C, C++ |
 | `__CUDA` | Enable CUDA acceleration | C, C++ |
 | `__HIP`  | Enable HIP acceleration | C, C++ |
-
-
-
-
diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/1-code-structure.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/1-code-structure.md
index f70d93b1397..a6668a2e6fd 100644
--- a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/1-code-structure.md
+++ b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/1-code-structure.md
@@ -5,11 +5,10 @@ title: Code Structure
 ```
 dbcsr/
 -- src/
----- acc/: contains all code related to accelerators
------- include/: contains interfaces to acc and acc_libsmm
------- cuda/: cuda interface
------- hip/: hip interface
------- openmp/ (PR #260): openmp offloading interface
------- libsmm_acc/: small matrix-matrix operations implementation on GPU (can use either cuda or hip interface)
------- libsmm_omp/ (PR #260): small matrix-matrix operations implementation on GPU (uses necessarily the openmp interface)
+---- acc/: contains interfaces to ACC and LIBSMM (top-level) as well as backends (subdirectories)
+------ cuda/: CUDA backend
+------ hip/: HIP backend
+------ libsmm_acc/: small matrix-matrix operations on GPU (can use either cuda or hip interface)
+------ opencl/: OpenCL backend
+------ opencl/smm/: LIBSMM implementation based on OpenCL
 ```
diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/index.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/index.md
index e7007919ae3..d16170b5c03 100644
--- a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/index.md
+++ b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/index.md
@@ -1,3 +1,3 @@
-title: libsmm_acc
+title: LIBSMM (CUDA/HIP)
 
 {!./src/acc/libsmm_acc/README.md!}
diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/3-opencl-backend.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/3-opencl-backend.md
new file mode 100644
index 00000000000..8965c8f435b
--- /dev/null
+++ b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/3-opencl-backend.md
@@ -0,0 +1,3 @@
+title: OpenCL Backend
+
+{!./src/acc/opencl/README.md!}
diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/4-opencl-libsmm.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/4-opencl-libsmm.md
new file mode 100644
index 00000000000..abf96990301
--- /dev/null
+++ b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/4-opencl-libsmm.md
@@ -0,0 +1,3 @@
+title: OpenCL LIBSMM
+
+{!./src/acc/opencl/smm/README.md!}
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 167516e9516..6b7fdbadf3c 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,7 +6,7 @@ set(DBCSR_PROGRAM_SRCS_CPP dbcsr_example_3.cpp dbcsr_tensor_example_2.cpp)
 # Compile Fortran examples
 foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS_FTN})
   get_filename_component(dbcsr_program_name ${dbcsr_program_src} NAME_WE)
-  if (USE_HIP)
+  if (USE_ACCEL MATCHES "hip")
     hip_add_executable(${dbcsr_program_name} ${dbcsr_program_src})
   else ()
     add_executable(${dbcsr_program_name} ${dbcsr_program_src})
@@ -24,7 +24,7 @@ if (WITH_C_API)
   foreach (dbcsr_program_src ${DBCSR_PROGRAM_SRCS_CPP})
     get_filename_component(dbcsr_program_name ${dbcsr_program_src} NAME_WE)
     set(dbcsr_program_name ${dbcsr_program_name}_cpp)
-    if (USE_HIP)
+    if (USE_ACCEL MATCHES "hip")
       hip_add_executable(${dbcsr_program_name} ${dbcsr_program_src})
     else ()
       add_executable(${dbcsr_program_name} ${dbcsr_program_src})
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e73b89d74af..f75ebc7da9b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -105,6 +105,10 @@ add_fypp_sources(
   utils/dbcsr_toollib.F
   work/dbcsr_work_operations.F)
 
+set(DBCSR_OPENCL_SRCS
+    acc/opencl/acc_opencl.c acc/opencl/acc_opencl_event.c
+    acc/opencl/acc_opencl_mem.c acc/opencl/acc_opencl_stream.c)
+
 set(DBCSR_CUDA_SRCS
     acc/cuda/acc_cublas.cu
     acc/cuda/acc_cuda.cpp
@@ -141,13 +145,9 @@ add_library(dbcsr ${DBCSR_SRCS})
 set_target_properties(dbcsr PROPERTIES VERSION ${dbcsr_VERSION}
                                        SOVERSION ${dbcsr_APIVERSION})
 
-if (TARGET PkgConfig::deps)
-  target_link_libraries(dbcsr PRIVATE PkgConfig::deps)
-endif ()
-
-if (USE_SMM MATCHES "libxsmm")
-  # linker/include flags are managed by pkg-config (above)
+if (LIBXSMM_FOUND)
   target_compile_definitions(dbcsr PRIVATE __LIBXSMM)
+  target_link_libraries(dbcsr PRIVATE PkgConfig::LIBXSMM)
 endif ()
 
 if (BLAS_LIBRARIES MATCHES "mkl_")
@@ -203,6 +203,25 @@ if (OpenMP_FOUND)
   target_link_libraries(dbcsr PRIVATE OpenMP::OpenMP_Fortran)
 endif ()
 
+# =================================================================================================
+# DBCSR LIBRARY's OPENCL BACKEND
+
+if (USE_ACCEL MATCHES "opencl")
+  target_compile_definitions(dbcsr PRIVATE __DBCSR_ACC)
+  target_link_libraries(dbcsr PRIVATE ${OpenCL_LIBRARY})
+
+  # OpenCL backend
+  set(DBCSR_ACC_SRCS ${DBCSR_OPENCL_SRCS})
+  add_library(acc OBJECT ${DBCSR_ACC_SRCS})
+  target_compile_definitions(acc PRIVATE __OPENCL)
+  # account for DBCSR not calling libsmm_acc_init() (DBCSR only calls acc_init)
+  target_compile_definitions(acc PRIVATE __DBCSR_ACC)
+  target_include_directories(acc PRIVATE ${OpenCL_INCLUDE_DIRS})
+  target_sources(dbcsr PRIVATE $<TARGET_OBJECTS:acc>)
+  add_subdirectory(acc/opencl/smm)
+  target_sources(dbcsr PRIVATE $<TARGET_OBJECTS:libsmm_acc>)
+endif ()
+
 # =================================================================================================
 # DBCSR LIBRARY's CUDA BACKEND
 
@@ -240,7 +259,7 @@ function (CUDA_CONVERT_FLAGS EXISTING_TARGET)
   )
 endfunction ()
 
-if (USE_CUDA)
+if (USE_ACCEL MATCHES "cuda")
   if (${CMAKE_VERSION} VERSION_LESS 3.16)
     # workaround for CUDA support with CMake <3.16, see also see
     # https://gitlab.kitware.com/cmake/cmake/issues/17929 and
@@ -296,8 +315,7 @@ endif ()
 # =================================================================================================
 # DBCSR LIBRARY's HIP BACKEND
 
-if (USE_HIP)
-
+if (USE_ACCEL MATCHES "hip")
   if (USE_OPENMP)
     set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${OpenMP_CXX_FLAGS}")
   endif ()
@@ -335,7 +353,6 @@ if (USE_HIP)
 
   target_compile_definitions(dbcsr PRIVATE __DBCSR_ACC)
   target_compile_definitions(dbcsr PRIVATE __HIP)
-
 endif ()
 
 # =================================================================================================
@@ -401,7 +418,7 @@ write_basic_package_version_file(
   "${CMAKE_CURRENT_BINARY_DIR}/DBCSRConfigVersion.cmake"
   VERSION "${dbcsr_VERSION}"
   COMPATIBILITY SameMajorVersion)
-if (USE_HIP)
+if (USE_ACCEL MATCHES "hip")
   install(
     EXPORT libsmm_accTargets
     NAMESPACE "${config_namespace}"
diff --git a/src/acc/PACKAGE b/src/acc/PACKAGE
index c75a1ba4006..9b1679e7c76 100644
--- a/src/acc/PACKAGE
+++ b/src/acc/PACKAGE
@@ -1,5 +1,5 @@
 {
 "description": "Generic accelerator API",
 "archive": "libdbcsr",
-"requires": ["../base", "cuda", "hip", "libsmm_acc"]
+"requires": ["../base", "cuda", "hip", "opencl", "libsmm_acc"]
 }
diff --git a/src/acc/README.md b/src/acc/README.md
index 3355b527109..48ecbbda23e 100644
--- a/src/acc/README.md
+++ b/src/acc/README.md
@@ -2,28 +2,51 @@
 
 ## Overview
 
-This folder contains the ISO_C_BINDING based Fortran code of DBCSR's [ACC-backend interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc.h) and [LIBSMM/ACC-interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc_libsmm.h). It also contains the CUDA (for Nvidia GPUs) and HIP (for AMD GPUs) accelerator backends.
+This folder contains the ISO_C_BINDING based Fortran code of DBCSR's [ACC-backend interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc.h) and [LIBSMM/ACC-interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc_libsmm.h). It also contains the CUDA (for Nvidia GPUs), the HIP (for AMD GPUs), and the OpenCL accelerator backends.
 
 Further, two stand-alone sample codes are given exercising both interfaces (benchmarks).
 
 ## CUDA and HIP backends
 
-The code for both the CUDA and HIP backends is unique, and can be found in the `cuda` directory.
-We switch from one backend to the other via macros (`__CUDA` and `__HIP`).
+The code for both the CUDA and HIP backends is unified, and can be found in the `cuda` directory.
+At compile-time either one or the other backend is chosen per macro (`__CUDA` or `__HIP`).
+
+## OpenCL backend
+
+The code for both the OpenCL backends is enabled with a build-time macro (`__OPENCL`).
 
 ## Benchmarks
 
-Two stand-alone drivers (only depending on above mentioned interfaces) can be built locally and in a rather self-contained fashion, i.e., no DBCSR library is needed (except for runtime libraries such as OpenCL, and LIBXSMM for some auxiliary functionality). For LIBXSMM, a folder `libxsmm` parallel to DBCSR's root directory (`dbcsr`) is expected to be present and prebuilt (`make` in LIBXSMM's root directory is enough). To build the driver code, change into the respective backend folder (`cuda` or `opencl`), and invoke `make` (`DBG=0|1|2`, and a few other key-value pairs are optional). When building the code is completed, change back into the parent folder and invoke either `acc_bench_trans` or `acc_bench_smm`.
+Two stand-alone drivers (only depending on above mentioned interfaces) can be built locally and in a rather self-contained fashion, i.e., no DBCSR library is needed (except runtime libraries such as CUDA, HIP, OpenCL/LIBXSMM). For OpenCL, a folder `libxsmm` parallel to DBCSR's root directory (`dbcsr`) is expected to be present and prebuilt (`make` in LIBXSMM's root directory is enough). To build the driver code, change into the respective backend folder (`cuda` or `opencl`), and invoke `make` (`DBG=0|1|2`, and a few other key-value pairs are optional). When building the code is completed, change back into the parent folder and invoke either `acc_bench_trans` or `acc_bench_smm`.
 
-The drivers support a few command line options (_nrepeat_, _stack_size_, _m_, _n_, ...); running the tranpose benchmark may look like:
+The drivers support a few command line options (_nrepeat_, _stack_size_, _m_, _n_, ...). Command line arguments are positional but allow `0` as placeholder to access the default value (`acc_bench_smm 0 0 5 13 5` performs the default number of repetitions with the default stacksize when running the 5x13x5-kernel). For example, running the tranpose benchmark may look like:
 
 ```bash
 $ OMP_PROC_BIND=TRUE ./acc_bench_trans 5 30000 23 23
 ./acc_bench_trans 5 30000 23 23
-copy-in: 16.8 ms 7.4 GB/s
+typename (id=3): double
+copy-in: 17.2 ms 7.2 GB/s
 device: 8.7 ms 14.2 GB/s
-host: 8.5 ms 14.5 GB/s
+host: 8.4 ms 14.6 GB/s
 errors: 0
 ```
 
-For timing, comparison (host code), and validation, LIBXSMM is expected. The drivers exercise the respective backend as chosen to build the code.
+For timing, comparison (host code), and validation, LIBXSMM is required. The drivers exercise the respective backend. For example with the CUDA backend:
+
+```bash
+cd cuda
+make DBG=0 WITH_GPU=P100
+cd ..
+```
+
+For the OpenCL backend:
+
+```bash
+cd opencl
+make DBG=0
+cd ..
+```
+
+In either of the above cases, `acc_bench_trans` and `acc_bench_smm` are built using the respective backends.
+Both driver codes can be instantiated for at least double- and single-precision using a build-time macro (`ELEM_TYPE`).
+Several build-time settings can be made on the build-line (`-D`) or inside of the source files (`acc_bench_trans.c` or `acc_bench_smm.c`).
diff --git a/src/acc/acc_bench_smm.c b/src/acc/acc_bench_smm.c
index 1802010d7da..25a750ae845 100644
--- a/src/acc/acc_bench_smm.c
+++ b/src/acc/acc_bench_smm.c
@@ -11,6 +11,7 @@
 #include <string.h>
 #include <assert.h>
 #include <stdio.h>
+#include <math.h>
 
 #if defined(__LIBXSMM)
 # include <libxsmm.h>
@@ -20,15 +21,18 @@
 #if !defined(ELEM_TYPE)
 # define ELEM_TYPE double
 #endif
-#if !defined(EPSILON)
-# define EPSILON 1E-3
-#endif
 #if !defined(MAX_KERNEL_DIM)
 # define MAX_KERNEL_DIM 80
 #endif
 #if !defined(ALIGNMENT)
 # define ALIGNMENT 64
 #endif
+#if !defined(TRANSPOSE)
+# define TRANSPOSE 1
+#endif
+#if !defined(VALIDATE)
+# define VALIDATE 1
+#endif
 #if !defined(WARMUP)
 # define WARMUP 2
 #endif
@@ -40,11 +44,11 @@
   EXIT_SUCCESS != (NULL != ((const void*)(RPTR)) ? (*((int*)(RPTR)) = (EXPR)) : (EXPR))) assert(0)
 
 
-#if defined(_DEBUG) && defined(USE_LIBXSMM)
+#if defined(_DEBUG) && defined(USE_LIBXSMM) && defined(VALIDATE) && (0 != VALIDATE)
 static void print(FILE* ostream, const char* label, const ELEM_TYPE* mat, int m, int n);
 #endif
 
-static void init(int seed, ELEM_TYPE* dst, int m, int n);
+static void init(int seed, ELEM_TYPE* dst, int m, int n, double scale);
 /* for comparison, adopt artificial stack-setup from other DBCSR/ACC benchmarks */
 static void init_stack(int* stack, int stack_size,
   int mn, int mk, int kn, int nc, int na, int nb);
@@ -52,14 +56,22 @@ static void init_stack(int* stack, int stack_size,
 
 int main(int argc, char* argv[])
 {
-  const int nrepeat = (1 < argc ? atoi(argv[1]) : 5);
-  const int stack_size = (2 < argc ? atoi(argv[2]) : 30000);
-  const int m = (3 < argc ? atoi(argv[3]) : 23);
-  const int n = (4 < argc ? atoi(argv[4]) : m);
-  const int k = (5 < argc ? atoi(argv[5]) : m);
-  const int nc = (6 < argc ? MIN(atoi(argv[6]), stack_size) : MAX(stack_size / 16, 1));
-  const int na = (7 < argc ? atoi(argv[7]) : (10 * nc));
-  const int nb = (8 < argc ? atoi(argv[8]) : (10 * nc));
+  const int inr = (1 < argc ? atoi(argv[1]) : 0);
+  const int iss = (2 < argc ? atoi(argv[2]) : 0);
+  const int ism = (3 < argc ? atoi(argv[3]) : 0);
+  const int isn = (4 < argc ? atoi(argv[4]) : 0);
+  const int isk = (5 < argc ? atoi(argv[5]) : 0);
+  const int inc = (6 < argc ? atoi(argv[6]) : 0);
+  const int ina = (7 < argc ? atoi(argv[7]) : 0);
+  const int inb = (8 < argc ? atoi(argv[8]) : 0);
+  const int nrepeat = (0 < inr ? inr : 3);
+  const int stack_size = (0 < iss ? iss : 30000);
+  const int m = (0 < ism ? ism : 23);
+  const int n = (0 < isn ? isn : m);
+  const int k = (0 < isk ? isk : m);
+  const int nc = (0 < inc ? MIN(inc, stack_size) : MAX(stack_size / 16, 1));
+  const int na = (0 < ina ? ina : (10 * nc));
+  const int nb = (0 < inb ? inb : (10 * nc));
 #if defined(ALIGNMENT) && (0 < ALIGNMENT)
   const int ma = (int)ROUNDUP2(sizeof(ELEM_TYPE) * m, ALIGNMENT);
   const int ka = (int)ROUNDUP2(sizeof(ELEM_TYPE) * k, ALIGNMENT);
@@ -73,6 +85,10 @@ int main(int argc, char* argv[])
   const int warmup = MAX(WARMUP, 2) / 2 * 2;
 #else
   const int warmup = 0;
+#endif
+#if defined(VALIDATE) && (0 != VALIDATE)
+  const char *const env_check = getenv("CHECK");
+  const double check = (NULL == env_check ? -1 : fabs(atof(env_check)));
 #endif
   int *stack_hst = NULL, *stack_dev = NULL, *trans_hst = NULL, *trans_dev = NULL;
   ELEM_TYPE *amat_hst = NULL, *bmat_hst = NULL, *cmat_hst = NULL;
@@ -81,12 +97,17 @@ int main(int argc, char* argv[])
   void *stream = NULL;
 #if defined(USE_LIBXSMM)
   libxsmm_timer_tickint start;
-  double duration, transpose;
+# if defined(TRANSPOSE) && (0 != TRANSPOSE) && defined(VALIDATE) && (0 != VALIDATE)
+  double transpose;
+# endif
+  double duration;
 #endif
   assert(m <= (mn / n) && 0 == (mn % n) && k <= (mk / k) && 0 == (mk % k) && n <= (kn / n) && 0 == (kn % n));
   printf("%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "",
     nrepeat, stack_size, m, n, k, nc, na, nb);
   CHECK(acc_init(), &result);
+  /* note: libsmm_acc_init() may imply acc_init() */
+  CHECK(libsmm_acc_init(), &result);
   CHECK(acc_get_ndevices(&ndevices), &result);
   if (0 < ndevices) {
 #if defined(_DEBUG)
@@ -96,11 +117,14 @@ int main(int argc, char* argv[])
   else {
 #if defined(_DEBUG)
     fprintf(stderr, "Error: no device found!\n");
+#endif
+#if !defined(__CUDA)
+    CHECK(libsmm_acc_finalize(), NULL);
 #endif
     CHECK(acc_finalize(), NULL);
     return result;
   }
-  printf("element type: %s\n", DBCSR_STRINGIFY(ELEM_TYPE));
+  printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
   CHECK(acc_stream_create(&stream, "stream", -1/*default priority*/), &result);
   CHECK(acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * na, stream), &result);
   CHECK(acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * nb, stream), &result);
@@ -110,10 +134,10 @@ int main(int argc, char* argv[])
   CHECK(acc_stream_sync(stream), &result); /* ensure host-data is allocated */
   /* initialize matrices */
   for (i = 0; i < na; ++i) {
-    init(i/*seed*/ + 42, &amat_hst[i*mk], m, k);
+    init(i/*seed*/ + 42, &amat_hst[i*mk], m, k, 1.0 / (nc * na));
   }
   for (i = 0; i < nb; ++i) {
-    init(i/*seed*/ + 24, &bmat_hst[i*kn], k, n);
+    init(i/*seed*/ + 24, &bmat_hst[i*kn], k, n, 1.0 / (nc * nb));
     trans_hst[i] = i * kn;
   }
   init_stack(stack_hst, stack_size, mn, mk, kn, nc, na, nb);
@@ -138,6 +162,7 @@ int main(int argc, char* argv[])
     (sizeof(ELEM_TYPE) * (mk + kn) + sizeof(int) * 3)
       * stack_size / (duration * (1ULL << 30)));
 #endif
+#if defined(TRANSPOSE) && (0 != TRANSPOSE) && defined(VALIDATE) && (0 != VALIDATE)
   /* warmup execution and prebuild transpose-kernel */
   for (r = 0; r < warmup / 2; ++r) {
     CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
@@ -145,16 +170,17 @@ int main(int argc, char* argv[])
     CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
       DBCSR_TYPE(ELEM_TYPE), n, k, MAX_KERNEL_DIM, stream), &result);
   }
-#if defined(USE_LIBXSMM)
+# if defined(USE_LIBXSMM)
   CHECK(acc_stream_sync(stream), &result);
   start = libxsmm_timer_tick();
-#endif
+# endif
   /* to perform NN-SMMs on the device, all B-matrices are transposed upfront (SMM-kernel is limited to NT) */
   CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
     DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result);
-#if defined(USE_LIBXSMM)
+# if defined(USE_LIBXSMM)
   CHECK(acc_stream_sync(stream), &result);
   transpose = libxsmm_timer_duration(start, libxsmm_timer_tick());
+# endif
 #endif
   /* warmup execution and prebuild SMM-kernel */
   for (r = 0; r < warmup; ++r) {
@@ -174,12 +200,18 @@ int main(int argc, char* argv[])
 #if defined(USE_LIBXSMM)
   CHECK(acc_stream_sync(stream), &result);
   duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
-  if (EXIT_SUCCESS == result) {
+# if defined(VALIDATE) && (0 != VALIDATE)
+  if (0 != check && EXIT_SUCCESS == result) {
     ELEM_TYPE *const gold_hst = (ELEM_TYPE*)libxsmm_malloc(sizeof(ELEM_TYPE) * mn * nc);
-    const char transa = 'N', transb = 'N';
     const ELEM_TYPE alpha = 1, beta = 1;
+    const char transa = 'N';
+#   if !defined(TRANSPOSE) || (0 == TRANSPOSE)
+    const char transb = 'T';
+#   else
+    const char transb = 'N';
     printf("transpose: %.1f ms %.1f GFLOPS/s\n", 1000.0 * (duration + transpose) / nrepeat,
       ((size_t)2 * m * n * k) * stack_size / ((duration + transpose) * (1ULL << 30) / nrepeat));
+#   endif
     printf("device: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat,
       ((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat));
     memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc);
@@ -221,22 +253,28 @@ int main(int argc, char* argv[])
           }
         }
         if (0 < diff) {
-# if defined(_DEBUG)
+#   if defined(_DEBUG)
           print(stderr, "gold = ", gold, m, n);
           print(stderr, "test = ", test, m, n);
           fprintf(stderr, "diff = %g (%g != %g)\n", diff, a, b);
-# endif
+#   endif
           if (abserror < diff) {
             relerror = fabs(0 != a ? (diff / a) : (diff / b));
             abserror = diff;
           }
         }
       }
-      printf("max.error: rel=%g\n", relerror);
-      if (EPSILON < relerror) result = EXIT_FAILURE;
+      printf("max.error: abs=%g rel=%g\n", abserror, relerror);
+      if (0 < check && check < relerror) result = EXIT_FAILURE;
     }
     libxsmm_free(gold_hst);
   }
+  else
+# endif
+  if (EXIT_SUCCESS == result) {
+    printf("device: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat,
+      ((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat));
+  }
 #endif
   CHECK(acc_host_mem_deallocate(stack_hst, stream), NULL);
   CHECK(acc_host_mem_deallocate(trans_hst, stream), NULL);
@@ -249,6 +287,9 @@ int main(int argc, char* argv[])
   CHECK(acc_dev_mem_deallocate(bmat_dev), NULL);
   CHECK(acc_dev_mem_deallocate(cmat_dev), NULL);
   CHECK(acc_stream_destroy(stream), NULL);
+#if !defined(__CUDA)
+  CHECK(libsmm_acc_finalize(), NULL);
+#endif
   CHECK(acc_finalize(), NULL);
   if (EXIT_SUCCESS != result) {
     fprintf(stderr, "FAILED\n");
@@ -257,12 +298,13 @@ int main(int argc, char* argv[])
 }
 
 
-static void init(int seed, ELEM_TYPE* dst, int m, int n) {
+static void init(int seed, ELEM_TYPE* dst, int m, int n, double scale) {
+  const double seed1 = scale * seed + scale;
   int i, j;
   for (i = 0; i < n; ++i) {
     for (j = 0; j < m; ++j) {
       const int k = i * m + j;
-      dst[k] = (ELEM_TYPE)((seed + 1) * (k + 1));
+      dst[k] = (ELEM_TYPE)(seed1 * (k + 1));
     }
   }
 }
@@ -291,7 +333,7 @@ static void init_stack(int* stack, int stack_size,
 }
 
 
-#if defined(_DEBUG) && defined(USE_LIBXSMM)
+#if defined(_DEBUG) && defined(USE_LIBXSMM) && defined(VALIDATE) && (0 != VALIDATE)
 static void print(FILE* ostream, const char* label, const ELEM_TYPE* mat, int m, int n)
 {
   int i, j;
diff --git a/src/acc/acc_bench_trans.c b/src/acc/acc_bench_trans.c
index 000150436a3..ccf4288ba82 100644
--- a/src/acc/acc_bench_trans.c
+++ b/src/acc/acc_bench_trans.c
@@ -52,18 +52,25 @@ static void swap(int* m, int* n) { int tmp = *m; *m = *n; *n = tmp; }
 
 int main(int argc, char* argv[])
 {
-  const int nrepeat = (1 < argc ? atoi(argv[1]) : 5), offset = 0;
+  const int inr = (1 < argc ? atoi(argv[1]) : 0);
+  const int iss = (2 < argc ? atoi(argv[2]) : 0);
+  const int ism = (3 < argc ? atoi(argv[3]) : 0);
+  const int isn = (4 < argc ? atoi(argv[4]) : 0);
+  const int iof = (5 < argc ? atoi(argv[5]) : 0);
+  const int nrepeat = (0 < inr ? inr : 5);
   const int nodd = (0 < nrepeat ? ((nrepeat & 1/*odd*/) ? nrepeat : (nrepeat - 1)) : 1);
-  const int stack_size = (2 < argc ? atoi(argv[2]) : 30000);
-  const int m = (3 < argc ? atoi(argv[3]) : 23);
-  const int n = (4 < argc ? atoi(argv[4]) : m);
+  const int stack_size = (0 < iss ? iss : 30000);
+  const int m = (0 < ism ? ism : 23);
+  const int n = (0 < isn ? isn : m);
+  const int offset = (0 < iof ? iof : 0);
+  const int offset_stack_size = offset + stack_size;
 #if defined(ALIGNMENT) && (0 < ALIGNMENT)
   const int mn = (int)ROUNDUP2(sizeof(ELEM_TYPE) * m, ALIGNMENT) * n / sizeof(ELEM_TYPE);
 #else
   const int mn = m * n;
 #endif
 #if defined(SHUFFLE)
-  const size_t shuffle = libxsmm_shuffle((unsigned int)stack_size);
+  const size_t shuffle = libxsmm_shuffle((unsigned int)offset_stack_size);
 #endif
 #if defined(WARMUP) && (0 < WARMUP) && !defined(_DEBUG)
   const int warmup = MAX(WARMUP, 2) / 2 * 2;
@@ -84,6 +91,8 @@ int main(int argc, char* argv[])
   assert(m <= (mn / n) && 0 == (mn % n));
   printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
   CHECK(acc_init(), &result);
+  /* note: libsmm_acc_init() may imply acc_init() */
+  CHECK(libsmm_acc_init(), &result);
   CHECK(acc_get_ndevices(&ndevices), &result);
   if (0 < ndevices) {
 #if defined(_DEBUG)
@@ -93,45 +102,48 @@ int main(int argc, char* argv[])
   else {
 #if defined(_DEBUG)
     fprintf(stderr, "Error: no device found!\n");
+#endif
+#if !defined(__CUDA)
+    CHECK(libsmm_acc_finalize(), NULL);
 #endif
     CHECK(acc_finalize(), NULL);
     return result;
   }
-  printf("element type: %s\n", DBCSR_STRINGIFY(ELEM_TYPE));
+  printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
 #if defined(PRIORITY)
   CHECK(acc_stream_priority_range(&priomin, &priomax), &result);
   CHECK(acc_stream_create(&stream, "stream", (priomin + priomax) / 2), &result);
 #else
   CHECK(acc_stream_create(&stream, "stream", -1/*default priority*/), &result);
 #endif
-  CHECK(acc_host_mem_allocate((void**)&mat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
-  CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * stack_size, stream), &result);
+  CHECK(acc_host_mem_allocate((void**)&mat_hst, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result);
+  CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * offset_stack_size, stream), &result);
   CHECK(acc_stream_sync(stream), &result); /* ensure host-data is allocated */
-  for (i = 0; i < stack_size; ++i) { /* initialize matrices */
+  for (i = 0; i < offset_stack_size; ++i) { /* initialize matrices */
     init(i/*seed*/, &mat_hst[i*mn], m, n);
   }
-  for (i = 0; i < stack_size; ++i) { /* initialize indexes */
+  for (i = 0; i < offset_stack_size; ++i) { /* initialize indexes */
 #if defined(SHUFFLE)
-    const int j = mn * (int)((shuffle * i) % stack_size);
+    const int j = mn * (int)((shuffle * i) % offset_stack_size);
 #else
     const int j = mn * i;
 #endif
     stack_hst[i] = j;
   }
-  CHECK(acc_dev_mem_allocate((void**)&mat_dev, sizeof(ELEM_TYPE) * mn * stack_size), &result);
-  CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * stack_size), &result);
+  CHECK(acc_dev_mem_allocate((void**)&mat_dev, sizeof(ELEM_TYPE) * mn * offset_stack_size), &result);
+  CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * offset_stack_size), &result);
 #if defined(USE_LIBXSMM)
   CHECK(acc_stream_sync(stream), &result);
   start = libxsmm_timer_tick();
 #endif
-  CHECK(acc_memcpy_h2d(mat_hst, mat_dev, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
-  CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * stack_size, stream), &result);
+  CHECK(acc_memcpy_h2d(mat_hst, mat_dev, sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result);
+  CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * offset_stack_size, stream), &result);
 #if defined(USE_LIBXSMM)
   CHECK(acc_stream_sync(stream), &result);
   duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
   printf("copy-in: %.1f ms %.1f GB/s\n", 1000.0 * duration,
     (sizeof(ELEM_TYPE) * mn + sizeof(int))
-      * stack_size / (duration * (1ULL << 30)));
+      * offset_stack_size / (duration * (1ULL << 30)));
 #endif
   /* warmup execution and prebuild JIT kernels */
   for (r = 0; r < warmup / 2; ++r) {
@@ -156,25 +168,25 @@ int main(int argc, char* argv[])
     assert(0 < nodd && (nodd & 1/*odd*/));
     printf("device: %.1f ms %.1f GB/s\n", 1000.0 * duration / nodd,
       (sizeof(ELEM_TYPE) * mn + sizeof(int))
-        * stack_size / (duration * (1ULL << 30) / nodd));
+        * offset_stack_size / (duration * (1ULL << 30) / nodd));
     mm = m; nn = n;
     start = libxsmm_timer_tick();
     for (r = 0; r < nodd; ++r) {
       libxsmm_itrans_batch_omp(mat_hst, sizeof(ELEM_TYPE), mm, nn, mm, nn,
-        0/*index_base*/, sizeof(int)/*index_stride*/, stack_hst, stack_size);
+        0/*index_base*/, sizeof(int)/*index_stride*/, stack_hst + offset, stack_size);
       swap(&mm, &nn);
     }
     duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
     printf("host: %.1f ms %.1f GB/s\n", 1000.0 * duration / nodd,
       (sizeof(ELEM_TYPE) * mn + sizeof(int))
-        * stack_size / (duration * (1ULL << 30) / nodd));
+        * offset_stack_size / (duration * (1ULL << 30) / nodd));
     /* transfer result from device to host for validation */
     CHECK(acc_memcpy_d2h(mat_dev, mat_hst,
-      sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
+      sizeof(ELEM_TYPE) * mn * offset_stack_size, stream), &result);
     CHECK(acc_stream_sync(stream), &result);
     if (EXIT_SUCCESS == result) {
       unsigned int nerrors = 0;
-      for (i = 0; i < stack_size; ++i) {
+      for (i = offset; i < offset_stack_size; ++i) {
         ELEM_TYPE gold[MAX_KERNEL_DIM*MAX_KERNEL_DIM];
         const ELEM_TYPE *const test = mat_hst + mn * i;
         init(i/*seed*/, gold, m, n);
@@ -203,6 +215,9 @@ int main(int argc, char* argv[])
   CHECK(acc_dev_mem_deallocate(stack_dev), NULL);
   CHECK(acc_dev_mem_deallocate(mat_dev), NULL);
   CHECK(acc_stream_destroy(stream), NULL);
+#if !defined(__CUDA)
+  CHECK(libsmm_acc_finalize(), NULL);
+#endif
   CHECK(acc_finalize(), NULL);
   if (EXIT_SUCCESS != result) {
     fprintf(stderr, "FAILED\n");
diff --git a/src/acc/cuda/Makefile b/src/acc/cuda/Makefile
index 5f3f1c90b9c..3f938a1066a 100644
--- a/src/acc/cuda/Makefile
+++ b/src/acc/cuda/Makefile
@@ -14,9 +14,9 @@ INCALL := $(INCACC) $(INCSMM)
 
 LIBXSMMROOT ?= $(wildcard ../../../../libxsmm)
 NVCC ?= $(shell which nvcc 2>/dev/null)
-CUDA_PATH ?= $(if $(NVCC),$(abspath $(dir $($(NVCC)))/..))
+CUDA_PATH ?= $(if $(NVCC),$(abspath $(dir $(NVCC))/..))
 UNAME := $(shell uname)
-WITH_GPU ?= V100
+WITH_GPU ?= P100
 INTEL ?= 0
 DEV ?= 0
 
@@ -99,6 +99,12 @@ ifneq (,$(LIBXSMMROOT))
   CFLAGS += -pthread -D__LIBXSMM -I$(LIBXSMMROOT)/include
 endif
 
+ifneq (,$(CUDA_PATH))
+  CFLAGS += -I$(CUDA_PATH)/include
+  LDFLAGS += -L$(CUDA_PATH)/lib64/stubs
+  LDFLAGS += -L$(CUDA_PATH)/lib64
+endif
+
 ifneq (0,$(DEV))
   CXXFLAGS := -std=c++11 $(CFLAGS)
   CFLAGS := -std=c89 $(CFLAGS)
@@ -106,7 +112,7 @@ else
   CXXFLAGS := $(CFLAGS)
 endif
 
-LDFLAGS += -L$(CUDA_PATH)/lib64 -lcudart -lcublas -lnvrtc -lcuda
+LDFLAGS += -lcudart -lcublas -lnvrtc -lcuda
 CFLAGS += -Wno-variadic-macros -Wno-long-long
 
 .PHONY: all
diff --git a/src/acc/libsmm_acc/CMakeLists.txt b/src/acc/libsmm_acc/CMakeLists.txt
index 789827dd970..71a73f284c2 100644
--- a/src/acc/libsmm_acc/CMakeLists.txt
+++ b/src/acc/libsmm_acc/CMakeLists.txt
@@ -36,7 +36,7 @@ add_custom_command(
   OUTPUT smm_acc_kernels.h
   COMMENT "libsmm_acc: generating kernels")
 
-if (USE_CUDA)
+if (USE_ACCEL MATCHES "cuda")
 
   add_library(libsmm_acc OBJECT ${LIBSMM_ACC_FILES})
   target_compile_definitions(libsmm_acc PRIVATE __CUDA)
@@ -44,7 +44,7 @@ if (USE_CUDA)
   target_compile_definitions(
     libsmm_acc PRIVATE $<$<BOOL:${WITH_CUDA_PROFILING}>:__CUDA_PROFILING>)
 
-else () # i.e. USE_HIP
+elseif (USE_ACCEL MATCHES "hip")
 
   set_source_files_properties(${LIBSMM_ACC_SRC_FILES}
                               PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
diff --git a/src/acc/libsmm_acc/README.md b/src/acc/libsmm_acc/README.md
index 44265c5582e..92b7772a5d3 100644
--- a/src/acc/libsmm_acc/README.md
+++ b/src/acc/libsmm_acc/README.md
@@ -1,4 +1,4 @@
-# libsmm_acc: GPU Accelerated Small Matrix Multiplications
+# GPU Accelerated Small Matrix Multiplications
 
 `libsmm_acc` is a **lib**rary for **s**mall **m**atrix-**m**atrix multiplication on a GPU-**acc**elerator. Stacks of matrix-matrix multiplication indices are passed from DBCSR to `libsmm_acc` which performs the multiplications on the GPU.
 
diff --git a/src/acc/opencl/Makefile b/src/acc/opencl/Makefile
new file mode 100644
index 00000000000..fce57d38f70
--- /dev/null
+++ b/src/acc/opencl/Makefile
@@ -0,0 +1,144 @@
+INCACC := $(wildcard *.h*) ../acc.h
+SRCACC := $(wildcard *.c)
+OBJACC := $(SRCACC:.c=.o)
+
+INCSMM := $(wildcard smm/*.h*) smm/opencl_kernels.h ../acc_libsmm.h
+SRCSMM := $(wildcard smm/*.c)
+OBJSMM := $(SRCSMM:.c=.o)
+KERNEL := $(wildcard smm/kernels/*.cl)
+
+INCALL := $(INCACC) $(INCSMM)
+
+LIBXSMMROOT ?= $(wildcard ../../../../libxsmm)
+UNAME := $(shell uname)
+INTEL ?= 0
+DEV ?= 0
+
+CFLAGS := -fPIC \
+  -Wall -Wextra -pedantic \
+  -Wno-overlength-strings \
+  -Wno-variadic-macros \
+  -Wno-long-long \
+  -D__OPENCL \
+  $(NULL)
+
+ifneq (0,$(DEV))
+  CFLAGS := -std=c89 $(CFLAGS)
+  CFLAGS += -Wno-unused-parameter
+else
+  CFLAGS := -std=c99 $(CFLAGS)
+endif
+
+ifeq (1,$(INTEL))
+  CXX := icpc
+  CC := icc
+  AR := xiar
+else ifneq (0,$(INTEL))
+  CXX := icpx
+  CC := icx
+  AR := xiar
+else
+  CXX := g++
+  CC := gcc
+  ifneq (Darwin,$(UNAME))
+    AR := gcc-ar
+  else
+    AR := ar
+  endif
+endif
+
+ifneq (0,$(DBG))
+  ifeq (,$(DBG))
+    CFLAGS += -O2
+  else
+    ifneq (1,$(DBG))
+      CFLAGS += -D_DEBUG
+    endif
+    CFLAGS += -O0
+  endif
+else
+  CFLAGS += -O2 -DNDEBUG
+  SYM := 0
+endif
+ifneq (0,$(SYM))
+  CFLAGS += -g
+endif
+
+ifneq (0,$(OMP))
+ifneq (0,$(INTEL))
+  CFLAGS += -qopenmp
+  LDFLAGS += -qopenmp
+else ifneq (Darwin,$(UNAME))
+  CFLAGS += -fopenmp
+  LDFLAGS += -fopenmp
+else # macOS
+  CFLAGS += -Xpreprocessor -fopenmp
+  LDFLAGS += -lomp
+endif
+endif
+
+ifneq (,$(LIBXSMMROOT))
+  LDFLAGS := -pthread $(LDFLAGS) -L$(LIBXSMMROOT)/lib -lxsmmext -lxsmm -lxsmmnoblas -ldl -lm
+  CFLAGS += -pthread -D__LIBXSMM -I$(LIBXSMMROOT)/include
+endif
+
+ifeq (Darwin,$(UNAME))
+  LDFLAGS += -framework OpenCL
+else
+  ifneq (,$(CUDATOOLKIT_HOME))
+    CFLAGS += -I$(CUDATOOLKIT_HOME)/include
+    LDFLAGS += -L$(CUDATOOLKIT_HOME)/lib64
+  else ifneq (,$(NVSDKCOMPUTE_ROOT))
+    CFLAGS += -I$(NVSDKCOMPUTE_ROOT)/include
+    LDFLAGS += -L$(NVSDKCOMPUTE_ROOT)/lib64
+  endif
+  LDFLAGS += -lOpenCL
+endif
+
+.PHONY: all
+all: ../dbcsr_acc.a ../dbcsr_acc_smm.a bench test
+
+.PHONY: bench
+bench: ../acc_bench_smm ../acc_bench_trans
+
+.PHONY: test
+test: ../dbcsr_acc_test
+
+smm/opencl_kernels.h: acc_opencl.sh $(KERNEL)
+	./acc_opencl.sh $(KERNEL) smm/tune_multiply.csv $@
+
+../dbcsr_acc.a: $(OBJACC)
+	$(AR) -rs $@ $^
+
+../dbcsr_acc_smm.a: $(OBJSMM)
+	$(AR) -rs $@ $^
+
+%.o: %.c $(INCALL) Makefile
+	$(CC) $(CFLAGS) -c $< -o $@
+
+acc_bench_smm.o: ../acc_bench_smm.c Makefile
+	$(CC) $(CFLAGS) -c $< -o $@
+../acc_bench_smm: acc_bench_smm.o ../dbcsr_acc_smm.a ../dbcsr_acc.a
+	$(CC) $^ $(LDFLAGS) -o $@
+
+acc_bench_trans.o: ../acc_bench_trans.c Makefile
+	$(CC) $(CFLAGS) -c $< -o $@
+../acc_bench_trans: acc_bench_trans.o ../dbcsr_acc_smm.a ../dbcsr_acc.a
+	$(CC) $^ $(LDFLAGS) -o $@
+
+dbcsr_acc_test.o: ../../../tests/dbcsr_acc_test.c Makefile
+	$(CC) $(CFLAGS) -c $< -o $@
+../dbcsr_acc_test: dbcsr_acc_test.o ../dbcsr_acc.a
+	$(CC) $^ $(LDFLAGS) -o $@
+
+.PHONY: clean
+clean:
+	@rm -f $(OBJACC) $(OBJSMM)
+	@rm -f acc_bench_smm.o acc_bench_trans.o dbcsr_acc_test.o
+	@rm -f smm/opencl_kernels.h
+
+.PHONY: realclean
+realclean: clean
+	@rm -f ../dbcsr_acc.a ../dbcsr_acc_smm.a
+	@rm -f ../acc_bench_smm ../acc_bench_trans
+	@rm -f ../dbcsr_acc_test
diff --git a/src/acc/opencl/PACKAGE b/src/acc/opencl/PACKAGE
new file mode 100644
index 00000000000..1c6d937d11b
--- /dev/null
+++ b/src/acc/opencl/PACKAGE
@@ -0,0 +1,5 @@
+{
+"description": "OpenCL backend for accelerator API",
+"archive": "libdbcsr",
+"requires": [".."]
+}
diff --git a/src/acc/opencl/README.md b/src/acc/opencl/README.md
new file mode 100644
index 00000000000..47791410edc
--- /dev/null
+++ b/src/acc/opencl/README.md
@@ -0,0 +1,30 @@
+# OpenCL Backend
+
+## Overview
+
+The OpenCL backend implements the [ACC interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc.h), which is also exposed in Fortran and used throughout DBCSR's code base to drive (GPU-)accelation based on ACC's device enumeration, data movement, and synchronization functionality. The customizations below below are typically for development, e.g., when attempting to contribute new functionality or features, or meant for debug purpose (and not necessarily settings to be made when using DBCSR or CP2K).
+
+## Customization
+
+### Compile-time Settings
+
+Compile-time settings are (implicitly) documented and can be adjusted by editing [acc_opencl.h](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/acc_opencl.h) (adjusting the build-line as per `-D` is possible as well but less convenient). For example, `ACC_OPENCL_STREAM_PRIORITIES` is enabled by default (and further confirmed at runtime/build-time) but can be disabled, or `ACC_OPENCL_VERBOSE` (which is disabled by default) can be enabled for debug purpose. More sensitive/private compile-time settings may be available within particular translation units like in `acc_opencl_mem.c`.
+
+An application of compile-time settings (and perhaps a valuable contribution) might be to call a GPU library in OpenCL-based LIBSMM. In such case, Shared Virtual Memory support (SVM) in OpenCL comes handy and can be enabled per `ACC_OPENCL_SVM`. The latter allows then to simply take the raw pointer out of an `cl_mem` object, and pass it into such library/function (which in turn can work across language borders, etc.).
+
+### Runtime Settings
+
+Runtime settings are made by the means of environment variables (implemented in `acc_opencl.c`). There are variables for chosing an OpenCL device:
+
+* `ACC_OPENCL_VENDOR`: character string matching the vendor of the OpenCL device in an case-insensitive fashion, e.g., "intel".
+* `ACC_OPENCL_DEVTYPE`: character string matching the device-kind like "cpu", "gpu", or another kind if neither CPU or GPU.
+* `ACC_OPENCL_DEVICE`: non-negative integer number to select a device from the (internally enumerated) list of devices.
+
+The OpenCL backend enumerates and orders devices primarily by device-kind (GPU, CPU, and others in that order) and by memory capacity (secondary criterion). Device IDs are zero-based as per ACC interface (and less than what is permitted/returned by `acc_get_ndevices`).
+
+Other runtime settings include:
+
+* `ACC_OPENCL_ASYNC_MEMOPS`: Boolean value (zero or non-zero integer) for asynchronous data movements.
+* `ACC_OPENCL_SVM`: Boolean value (zero or non-zero integer) for Shared Virtual Memory (SVM).
+
+Please note: some of the above runtime settings depend on compile-time settings in the first place in order to be effective.
diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c
new file mode 100644
index 00000000000..9086047a029
--- /dev/null
+++ b/src/acc/opencl/acc_opencl.c
@@ -0,0 +1,597 @@
+/*------------------------------------------------------------------------------------------------*
+ * Copyright (C) by the DBCSR developers group - All rights reserved                              *
+ * This file is part of the DBCSR library.                                                        *
+ *                                                                                                *
+ * For information on the license, see the LICENSE file.                                          *
+ * For further information please visit https://dbcsr.cp2k.org                                    *
+ * SPDX-License-Identifier: GPL-2.0+                                                              *
+ *------------------------------------------------------------------------------------------------*/
+#if defined(__OPENCL)
+#include "acc_opencl.h"
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <limits.h>
+#include <ctype.h>
+#if defined(_OPENMP)
+# include <omp.h>
+#endif
+#if defined(_WIN32)
+# include <windows.h>
+#else
+# include <glob.h>
+#endif
+#if defined(__DBCSR_ACC)
+# include "../acc_libsmm.h"
+#endif
+
+#if !defined(ACC_OPENCL_EXTLINE)
+# define ACC_OPENCL_EXTLINE
+#endif
+#if !defined(ACC_OPENCL_DELIMS)
+# define ACC_OPENCL_DELIMS " \t;,:"
+#endif
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+acc_opencl_options_t acc_opencl_options;
+int acc_opencl_ndevices;
+cl_device_id acc_opencl_devices[ACC_OPENCL_DEVICES_MAXCOUNT];
+cl_context acc_opencl_context;
+
+#if !defined(NDEBUG)
+void acc_opencl_notify(const char* /*errinfo*/, const void* /*private_info*/, size_t /*cb*/, void* /*user_data*/);
+void acc_opencl_notify(const char* errinfo, const void* private_info, size_t cb, void* user_data)
+{
+  ACC_OPENCL_UNUSED(private_info); ACC_OPENCL_UNUSED(cb); ACC_OPENCL_UNUSED(user_data);
+  fprintf(stderr, "ERROR ACC/OpenCL: %s\n", errinfo);
+}
+#endif
+
+
+const char* acc_opencl_stristr(const char* a, const char* b)
+{
+  const char* result = NULL;
+  if (NULL != a && NULL != b && '\0' != *a && '\0' != *b) {
+    do {
+      if (tolower(*a) != tolower(*b)) {
+        ++a;
+      }
+      else {
+        const char* c = b;
+        result = a;
+        while ('\0' != *++a && '\0' != *++c) {
+          if (tolower(*a) != tolower(*c)) {
+            result = NULL;
+            break;
+          }
+        }
+        if ('\0' != c[0] && '\0' != c[1]) {
+          result = NULL;
+        }
+        else break;
+      }
+    } while ('\0' != *a);
+  }
+  return result;
+}
+
+
+/* comparator used with qsort; stabilized by tail condition (a < b ? -1 : 1) */
+int acc_opencl_order_devices(const void* /*dev_a*/, const void* /*dev_b*/);
+int acc_opencl_order_devices(const void* dev_a, const void* dev_b)
+{
+  const cl_device_id *const a = (const cl_device_id*)dev_a;
+  const cl_device_id *const b = (const cl_device_id*)dev_b;
+  cl_device_type type_a, type_b;
+  assert(NULL != a && NULL != b && a != b);
+  ACC_OPENCL_EXPECT(EXIT_SUCCESS, clGetDeviceInfo(*a,
+    CL_DEVICE_TYPE, sizeof(cl_device_type), &type_a, NULL));
+  ACC_OPENCL_EXPECT(EXIT_SUCCESS, clGetDeviceInfo(*b,
+    CL_DEVICE_TYPE, sizeof(cl_device_type), &type_b, NULL));
+  if (CL_DEVICE_TYPE_DEFAULT & type_a) return -1;
+  else if (CL_DEVICE_TYPE_DEFAULT & type_b) return 1;
+  else {
+    if (CL_DEVICE_TYPE_GPU & type_a) {
+      if (CL_DEVICE_TYPE_GPU & type_b) {
+        size_t size_a, size_b;
+        ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*a, NULL, &size_a));
+        ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*b, NULL, &size_b));
+        return (size_a < size_b ? 1 : (size_a != size_b ? -1 : (a < b ? -1 : 1)));
+      }
+      else return -1;
+    }
+    else if (CL_DEVICE_TYPE_GPU & type_b) return 1;
+    else {
+      if (CL_DEVICE_TYPE_ACCELERATOR & type_a) {
+        if (CL_DEVICE_TYPE_ACCELERATOR & type_b) {
+          size_t size_a, size_b;
+          ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*a, NULL, &size_a));
+          ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*b, NULL, &size_b));
+          return (size_a < size_b ? 1 : (size_a != size_b ? -1 : (a < b ? -1 : 1)));
+        }
+        else return -1;
+      }
+      else if (CL_DEVICE_TYPE_ACCELERATOR & type_b) return 1;
+      else {
+        size_t size_a, size_b;
+        ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*a, NULL, &size_a));
+        ACC_OPENCL_EXPECT(EXIT_SUCCESS, acc_opencl_info_devmem(*b, NULL, &size_b));
+        return (size_a < size_b ? 1 : (size_a != size_b ? -1 : (a < b ? -1 : 1)));
+      }
+    }
+  }
+}
+
+
+int acc_init(void)
+{
+#if defined(_OPENMP)
+  /* initialization/finalization is not meant to be thread-safe */
+  int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE);
+#else
+  int result = EXIT_SUCCESS;
+#endif
+  if (0 == acc_opencl_ndevices) { /* avoid to initialize multiple times */
+    const char *const disable = getenv("ACC_OPENCL_DISABLE");
+    if (NULL == disable || '0' == *disable) {
+      cl_platform_id platforms[ACC_OPENCL_DEVICES_MAXCOUNT];
+      char buffer[ACC_OPENCL_BUFFERSIZE];
+      const char *const env_device_vendor = getenv("ACC_OPENCL_VENDOR");
+      const char *const env_device_type = getenv("ACC_OPENCL_DEVTYPE");
+      const char *const env_device_id = getenv("ACC_OPENCL_DEVICE");
+      int device_id = (NULL == env_device_id ? 0 : atoi(env_device_id));
+      cl_uint nplatforms = 0, ndevices = 0, i;
+      cl_device_type type = CL_DEVICE_TYPE_ALL;
+      ACC_OPENCL_CHECK(clGetPlatformIDs(0, NULL, &nplatforms),
+        "query number of platforms", result);
+      ACC_OPENCL_CHECK(clGetPlatformIDs(
+        nplatforms <= ACC_OPENCL_DEVICES_MAXCOUNT ? nplatforms : ACC_OPENCL_DEVICES_MAXCOUNT,
+        platforms, 0), "retrieve platform ids", result);
+      if (NULL != env_device_type && '\0' != *env_device_type) {
+        if (NULL != acc_opencl_stristr(env_device_type, "gpu")) type = CL_DEVICE_TYPE_GPU;
+        else if (NULL != acc_opencl_stristr(env_device_type, "cpu")) type = CL_DEVICE_TYPE_CPU;
+        else type = CL_DEVICE_TYPE_ACCELERATOR;
+      }
+      acc_opencl_ndevices = 0;
+      for (i = 0; i < nplatforms; ++i) {
+        if (EXIT_SUCCESS == result
+          && CL_SUCCESS == clGetDeviceIDs(platforms[i], type, 0, NULL, &ndevices))
+        {
+          const int n = (acc_opencl_ndevices + ndevices) < ACC_OPENCL_DEVICES_MAXCOUNT
+            ? (int)ndevices : (ACC_OPENCL_DEVICES_MAXCOUNT - acc_opencl_ndevices);
+          if (CL_SUCCESS == clGetDeviceIDs(platforms[i], type,
+            n, acc_opencl_devices + acc_opencl_ndevices, NULL))
+          {
+            acc_opencl_ndevices += n;
+          }
+          else {
+            ACC_OPENCL_ERROR("retrieve device ids", result);
+          }
+        }
+      }
+      assert(NULL == acc_opencl_context);
+      if (device_id < acc_opencl_ndevices) {
+        if (NULL != env_device_vendor && '\0' != *env_device_vendor) {
+          for (i = 0; i < (cl_uint)acc_opencl_ndevices;) {
+            buffer[0] = '\0';
+            if (CL_SUCCESS == clGetDeviceInfo(acc_opencl_devices[i],
+              CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL))
+            {
+              if (NULL == acc_opencl_stristr(buffer, env_device_vendor)) {
+                --acc_opencl_ndevices;
+                if (i < (cl_uint)acc_opencl_ndevices) { /* keep relative order of IDs */
+                  memmove(acc_opencl_devices + i, acc_opencl_devices + i + 1,
+                    sizeof(cl_device_id) * (acc_opencl_ndevices - i));
+                }
+              }
+              else ++i;
+            }
+            else {
+              ACC_OPENCL_ERROR("retrieve device vendor", result);
+              break;
+            }
+          }
+        }
+      }
+      if (device_id < acc_opencl_ndevices) {
+        if (EXIT_SUCCESS == result && 1 < acc_opencl_ndevices) {
+          /* reorder devices according to acc_opencl_order_devices */
+          qsort(acc_opencl_devices, acc_opencl_ndevices,
+            sizeof(cl_device_id), acc_opencl_order_devices);
+          /* preselect default device */
+          if (NULL == env_device_id || '\0' == *env_device_id) {
+            for (i = 0; i < (cl_uint)acc_opencl_ndevices; ++i) {
+              ACC_OPENCL_CHECK(clGetDeviceInfo(acc_opencl_devices[i],
+                CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL),
+                "retrieve device type", result);
+              if (CL_DEVICE_TYPE_DEFAULT & type) {
+                device_id = (int)i;
+                break;
+              }
+            }
+          }
+        }
+        if (EXIT_SUCCESS == result) {
+          cl_device_id active_device;
+          result = acc_opencl_set_active_device(device_id, &active_device);
+#if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT)
+          if (EXIT_SUCCESS == result) {
+            const cl_context context = acc_opencl_context;
+#           pragma omp parallel
+            if (context != acc_opencl_context) {
+              if (CL_SUCCESS == clRetainContext(context)) {
+                acc_opencl_context = context;
+              }
+              else {
+                ACC_OPENCL_ERROR("retain context", result);
+                acc_opencl_context = NULL;
+              }
+            }
+          }
+#endif
+#if defined(ACC_OPENCL_MEM_ASYNC)
+          if (EXIT_SUCCESS == result) {
+            const char *const env = getenv("ACC_OPENCL_ASYNC_MEMOPS");
+            if (NULL == env) {
+              const int confirmation = acc_opencl_device_vendor(active_device, "nvidia");
+              acc_opencl_options.async_memops = (EXIT_SUCCESS != confirmation);
+            }
+            else acc_opencl_options.async_memops = (0 != atoi(env));
+          }
+          else
+#endif
+          acc_opencl_options.async_memops = CL_FALSE;
+#if defined(ACC_OPENCL_SVM)
+          if (EXIT_SUCCESS == result) {
+            const char *const env = getenv("ACC_OPENCL_SVM");
+            int level_major = 0;
+            acc_opencl_options.svm_interop = (NULL == env || 0 != atoi(env)) &&
+              (EXIT_SUCCESS == acc_opencl_device_level(active_device,
+                &level_major, NULL/*level_minor*/) && 2 <= level_major);
+          }
+          else
+#endif
+          acc_opencl_options.svm_interop = CL_FALSE;
+        }
+      }
+      else { /* mark as initialized */
+        acc_opencl_ndevices = -1;
+      }
+    }
+    else { /* mark as initialized */
+      acc_opencl_ndevices = -1;
+    }
+#if defined(__DBCSR_ACC)
+    /* DBCSR shall call acc_init as well as libsmm_acc_init (since both interfaces are used).
+     * Also, libsmm_acc_init may privately call acc_init (as it depends on the ACC interface).
+     * The implementation of acc_init should hence be safe against "over initialization".
+     * However, DBCSR only calls acc_init (and expects an implicit libsmm_acc_init).
+     */
+    if (EXIT_SUCCESS == result) {
+      result = libsmm_acc_init();
+    }
+#endif
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_finalize(void)
+{
+#if defined(_OPENMP)
+  /* initialization/finalization is not meant to be thread-safe */
+  int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE);
+#else
+  int result = EXIT_SUCCESS;
+#endif
+  if (NULL != acc_opencl_context) {
+    const cl_context context = acc_opencl_context;
+    assert(0 < acc_opencl_ndevices);
+#if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT)
+#   pragma omp parallel
+    if (context != acc_opencl_context) {
+      ACC_OPENCL_CHECK(clReleaseContext(acc_opencl_context),
+        "release context", result);
+      acc_opencl_context = NULL;
+    }
+#endif
+    ACC_OPENCL_CHECK(clReleaseContext(context),
+      "release context", result);
+    acc_opencl_context = NULL;
+#if defined(__DBCSR_ACC)
+    /* DBCSR may call acc_init() as well as libsmm_acc_init() since both interface are used.
+     * libsmm_acc_init may privately call acc_init (as it depends on the ACC interface).
+     * The implementation of acc_init() should be safe against "over initialization".
+     * However, DBCSR only calls acc_init() and expects an implicit libsmm_acc_init().
+     */
+    if (EXIT_SUCCESS == result) {
+      result = libsmm_acc_finalize();
+    }
+#endif
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+void acc_clear_errors(void)
+{
+}
+
+
+int acc_get_ndevices(int* ndevices)
+{
+  int result;
+
+#if defined(__DBCSR_ACC)
+  /* DBCSR calls acc_get_ndevices before calling acc_init(). */
+  result = acc_init();
+  if (EXIT_SUCCESS == result)
+#endif
+  {
+    if (NULL != ndevices && 0 != acc_opencl_ndevices) {
+      *ndevices = (0 < acc_opencl_ndevices ? acc_opencl_ndevices : 0);
+      result = EXIT_SUCCESS;
+    }
+    else {
+      result = EXIT_FAILURE;
+    }
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_opencl_device(void* stream, cl_device_id* device)
+{
+  int result = EXIT_SUCCESS;
+  assert(NULL != device);
+  if (NULL != stream) {
+    ACC_OPENCL_CHECK(clGetCommandQueueInfo(*ACC_OPENCL_STREAM(stream), CL_QUEUE_DEVICE,
+      sizeof(cl_device_id), device, NULL), "retrieve device from queue", result);
+  }
+  else if (NULL != acc_opencl_context) {
+#if !defined(NDEBUG)
+    size_t n = sizeof(cl_device_id);
+    ACC_OPENCL_CHECK(clGetContextInfo(acc_opencl_context, CL_CONTEXT_DEVICES,
+      sizeof(cl_device_id), device, &n), "retrieve id of active device", result);
+#else
+    ACC_OPENCL_CHECK(clGetContextInfo(acc_opencl_context, CL_CONTEXT_DEVICES,
+      sizeof(cl_device_id), device, NULL), "retrieve id of active device", result);
+#endif
+    assert(EXIT_SUCCESS != result || sizeof(cl_device_id) == n/*single-device context*/);
+  }
+  else {
+    *device = NULL;
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_opencl_device_vendor(cl_device_id device, const char* vendor)
+{
+  char buffer[ACC_OPENCL_BUFFERSIZE];
+  int result = EXIT_SUCCESS;
+  assert(NULL != device && NULL != vendor);
+  buffer[0] = '\0';
+  ACC_OPENCL_CHECK(clGetDeviceInfo(device,
+    CL_DEVICE_VENDOR, ACC_OPENCL_BUFFERSIZE, buffer, NULL),
+    "retrieve device vendor", result);
+  if (EXIT_SUCCESS == result) {
+    return (NULL != acc_opencl_stristr(buffer, vendor)
+      ? EXIT_SUCCESS
+      : EXIT_FAILURE);
+  }
+  else ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_opencl_device_level(cl_device_id device, int* level_major, int* level_minor)
+{
+  char buffer[ACC_OPENCL_BUFFERSIZE];
+  int result = EXIT_SUCCESS;
+  assert(NULL != device && (NULL != level_major || NULL != level_minor));
+  ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_VERSION,
+    ACC_OPENCL_BUFFERSIZE, buffer, NULL),
+    "retrieve device level", result);
+  if (EXIT_SUCCESS == result) {
+    unsigned int level[2];
+    /* input: "OpenCL <level_major>.<level_minor> ..." */
+    if (2 == sscanf(buffer, "%*s %u.%u", level, level+1)) {
+      if (NULL != level_major) *level_major = (int)level[0];
+      if (NULL != level_minor) *level_minor = (int)level[1];
+    }
+    else {
+      result = EXIT_SUCCESS;
+    }
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_opencl_device_ext(cl_device_id device, const char *const extnames[], int num_exts)
+{
+  int result = ((NULL != extnames && 0 < num_exts) ? EXIT_SUCCESS : EXIT_FAILURE);
+  char extensions[ACC_OPENCL_BUFFERSIZE], buffer[ACC_OPENCL_BUFFERSIZE];
+  assert(NULL != device);
+  ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS,
+    ACC_OPENCL_BUFFERSIZE, extensions, NULL),
+    "retrieve device extensions", result);
+  if (EXIT_SUCCESS == result) {
+    do {
+      --num_exts;
+      if (NULL == extnames[num_exts]) {
+        return EXIT_FAILURE;
+      }
+      else {
+        char *const exts = strncpy(buffer, extnames[num_exts], ACC_OPENCL_BUFFERSIZE - 1);
+        const char* ext = strtok(exts, ACC_OPENCL_DELIMS);
+        for (; NULL != ext; ext = strtok(NULL, ACC_OPENCL_DELIMS)) {
+          if (NULL == strstr(extensions, ext)) {
+            return EXIT_FAILURE;
+          }
+        }
+      }
+    } while (0 < num_exts);
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_opencl_set_active_device(int device_id, cl_device_id* device)
+{
+  cl_int result = (((0 <= device_id && device_id < acc_opencl_ndevices) ||
+    /* allow successful completion if no device was found */
+    0 > acc_opencl_ndevices) ? EXIT_SUCCESS : EXIT_FAILURE);
+  if (0 < acc_opencl_ndevices) {
+    const cl_device_id active_id = acc_opencl_devices[device_id];
+    cl_device_id current_id = NULL;
+    if (EXIT_SUCCESS == result) result = acc_opencl_device(NULL/*stream*/, &current_id);
+    if (EXIT_SUCCESS == result && active_id != current_id) {
+      if (NULL != acc_opencl_context) {
+        ACC_OPENCL_CHECK(clReleaseContext(acc_opencl_context),
+          "release context", result);
+      }
+      if (EXIT_SUCCESS == result) {
+        cl_context_properties properties[] = {
+          /* insert other properties in front of below property */
+          CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE, /* TODO */
+          0 /* end of properties */
+        };
+#if defined(NDEBUG)
+        void (*const notify)(const char*, const void*, size_t, void*) = NULL;
+#else
+        void (*const notify)(const char*, const void*, size_t, void*) = acc_opencl_notify;
+#endif
+        acc_opencl_context = clCreateContext(properties,
+          1/*num_devices*/, &active_id, notify, NULL/* user_data*/, &result);
+        if (CL_INVALID_VALUE == result) { /* retry */
+          const size_t n = sizeof(properties) / sizeof(*properties);
+          assert(3 <= n);
+          properties[n-3] = 0;
+          acc_opencl_context = clCreateContext(0 != properties[0] ? properties : NULL,
+            1/*num_devices*/, &active_id, notify, NULL/* user_data*/, &result);
+        }
+        ACC_OPENCL_CHECK(result, "create context", result);
+      }
+    }
+    if (NULL != device) {
+      *device = (EXIT_SUCCESS == result ? active_id : NULL);
+    }
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_set_active_device(int device_id)
+{
+  ACC_OPENCL_RETURN(acc_opencl_set_active_device(device_id, NULL/*device*/));
+}
+
+
+int acc_opencl_wgsize(cl_device_id device, cl_kernel kernel,
+  int* max_value, int* preferred_multiple)
+{
+  int result = (NULL != device && (NULL != preferred_multiple
+                                || NULL != max_value))
+    ? EXIT_SUCCESS : EXIT_FAILURE;
+  if (NULL != kernel) { /* kernel-specific */
+    if (NULL != max_value) {
+      size_t value = 0;
+      ACC_OPENCL_CHECK(clGetKernelWorkGroupInfo(kernel, device,
+        CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &value, NULL),
+        "query maximum WG-size of kernel", result);
+      assert(value <= INT_MAX);
+      *max_value = (int)value;
+    }
+    if (NULL != preferred_multiple) {
+      size_t value = 0;
+      ACC_OPENCL_CHECK(clGetKernelWorkGroupInfo(kernel, device,
+        CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+        sizeof(size_t), &value, NULL),
+        "query preferred multiple of WG-size of kernel", result);
+      assert(value <= INT_MAX);
+      *preferred_multiple = (int)value;
+    }
+  }
+  else { /* device-specific */
+    if (NULL != max_value) {
+      size_t value = 0;
+      ACC_OPENCL_CHECK(clGetDeviceInfo(device,
+        CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &value, NULL),
+        "query maximum WG-size of device", result);
+      assert(value <= INT_MAX);
+      *max_value = (int)value;
+    }
+    if (NULL != preferred_multiple) {
+#if defined(CL_VERSION_3_0)
+      size_t value = 0;
+      ACC_OPENCL_CHECK(clGetDeviceInfo(device,
+        CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+        sizeof(size_t), &value, NULL),
+        "query preferred multiple of WG-size of device", result);
+      assert(value <= INT_MAX);
+      *preferred_multiple = (int)value;
+#else
+      result = EXIT_FAILURE;
+#endif
+    }
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_opencl_kernel(const char* source, const char* build_options,
+  const char* kernel_name, cl_kernel* kernel)
+{
+  char buffer[ACC_OPENCL_BUFFERSIZE] = "\0";
+  cl_int result;
+  assert(NULL != kernel);
+  if (NULL != acc_opencl_context) {
+    const cl_program program = clCreateProgramWithSource(
+      acc_opencl_context, 1/*nlines*/, &source, NULL, &result);
+    if (NULL != program) {
+      cl_device_id active_id = NULL;
+      assert(CL_SUCCESS == result);
+      result = acc_opencl_device(NULL/*stream*/, &active_id);
+      if (EXIT_SUCCESS == result) {
+        result = clBuildProgram(program,
+        1/*num_devices*/, &active_id, build_options,
+        NULL/*callback*/, NULL/*user_data*/);
+        if (CL_SUCCESS == result) {
+          *kernel = clCreateKernel(program, kernel_name, &result);
+          if (CL_SUCCESS == result) assert(NULL != *kernel);
+          else {
+            ACC_OPENCL_ERROR("create kernel", result);
+          }
+        }
+        else {
+          clGetProgramBuildInfo(program, active_id, CL_PROGRAM_BUILD_LOG,
+            ACC_OPENCL_BUFFERSIZE, buffer, NULL); /* ignore retval */
+          *kernel = NULL;
+        }
+      }
+      else {
+        *kernel = NULL;
+      }
+    }
+    else {
+      assert(CL_SUCCESS != result);
+      ACC_OPENCL_ERROR("create program", result);
+      *kernel = NULL;
+    }
+  }
+  else {
+    result = EXIT_FAILURE;
+    *kernel = NULL;
+  }
+  ACC_OPENCL_RETURN_CAUSE(result, buffer);
+}
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*__OPENCL*/
diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h
new file mode 100644
index 00000000000..f0c0988c8b9
--- /dev/null
+++ b/src/acc/opencl/acc_opencl.h
@@ -0,0 +1,247 @@
+/*------------------------------------------------------------------------------------------------*
+ * Copyright (C) by the DBCSR developers group - All rights reserved                              *
+ * This file is part of the DBCSR library.                                                        *
+ *                                                                                                *
+ * For information on the license, see the LICENSE file.                                          *
+ * For further information please visit https://dbcsr.cp2k.org                                    *
+ * SPDX-License-Identifier: GPL-2.0+                                                              *
+ *------------------------------------------------------------------------------------------------*/
+#ifndef ACC_OPENCL_H
+#define ACC_OPENCL_H
+
+#if !defined(CL_TARGET_OPENCL_VERSION)
+# define CL_TARGET_OPENCL_VERSION 220
+#endif
+
+#if defined(__OPENCL)
+# if defined(__APPLE__)
+#   include <OpenCL/cl.h>
+# else
+#   include <CL/cl.h>
+# endif
+#else
+# error Definition of __OPENCL preprocessor symbol is missing!
+#endif
+
+#if !defined(ACC_OPENCL_NOEXT)
+# if defined(__APPLE__)
+#   include <OpenCL/cl_ext.h>
+# else
+#   include <CL/cl_ext.h>
+# endif
+#endif
+
+#include "../acc.h"
+#if !defined(NDEBUG)
+# include <assert.h>
+#endif
+#include <stdio.h>
+
+#if !defined(ACC_OPENCL_CACHELINE_NBYTES)
+# define ACC_OPENCL_CACHELINE_NBYTES 64
+#endif
+#if !defined(ACC_OPENCL_MAXALIGN_NBYTES)
+# define ACC_OPENCL_MAXALIGN_NBYTES (2 << 20/*2MB*/)
+#endif
+#if !defined(ACC_OPENCL_BUFFERSIZE)
+# define ACC_OPENCL_BUFFERSIZE (8 << 10/*8KB*/)
+#endif
+#if !defined(ACC_OPENCL_DEVICES_MAXCOUNT)
+# define ACC_OPENCL_DEVICES_MAXCOUNT 32
+#endif
+
+/* can depend on OpenCL implementation */
+#if !defined(ACC_OPENCL_MEM_NOALLOC) && 1
+# define ACC_OPENCL_MEM_NOALLOC
+# define ACC_OPENCL_MEM(A) ((cl_mem*)&(A))
+#else
+# define ACC_OPENCL_MEM(A) ((cl_mem*)(A))
+#endif
+#if !defined(ACC_OPENCL_STREAM_NOALLOC) && 1
+# define ACC_OPENCL_STREAM_NOALLOC
+# define ACC_OPENCL_STREAM(A) ((cl_command_queue*)&(A))
+#else
+# define ACC_OPENCL_STREAM(A) ((cl_command_queue*)(A))
+#endif
+#if !defined(ACC_OPENCL_EVENT_NOALLOC) && 0
+/* incompatible with acc_event_record */
+# define ACC_OPENCL_EVENT_NOALLOC
+# define ACC_OPENCL_EVENT(A) ((cl_event*)&(A))
+#else
+# define ACC_OPENCL_EVENT(A) ((cl_event*)(A))
+#endif
+
+#if !defined(ACC_OPENCL_THREADLOCAL_CONTEXT) && 1
+# define ACC_OPENCL_THREADLOCAL_CONTEXT
+#endif
+#if !defined(ACC_OPENCL_STREAM_PRIORITIES) && 1
+# define ACC_OPENCL_STREAM_PRIORITIES
+#endif
+#if !defined(ACC_OPENCL_STREAM_SYNCFLUSH) && 0
+# define ACC_OPENCL_STREAM_FINISH
+#endif
+#if !defined(ACC_OPENCL_EVENT_BARRIER) && 0
+# define ACC_OPENCL_EVENT_BARRIER
+#endif
+#if !defined(ACC_OPENCL_MEM_ASYNC) && 1
+# define ACC_OPENCL_MEM_ASYNC
+#endif
+#if !defined(ACC_OPENCL_VERBOSE) && 0
+# define ACC_OPENCL_VERBOSE
+#endif
+#if !defined(ACC_OPENCL_SVM) && 0
+# if defined(CL_VERSION_2_0)
+#   define ACC_OPENCL_SVM
+# endif
+#endif
+
+#if defined(CL_VERSION_2_0)
+# define ACC_OPENCL_COMMAND_QUEUE_PROPERTIES cl_queue_properties
+#else
+# define ACC_OPENCL_COMMAND_QUEUE_PROPERTIES cl_int
+#endif
+
+#define ACC_OPENCL_UP2(N, NPOT) ((((uint64_t)N) + ((NPOT) - 1)) & ~((NPOT) - 1))
+#define ACC_OPENCL_UNUSED(VAR) (void)(VAR)
+
+#if defined(__cplusplus)
+# if defined(__GNUC__) || defined(_CRAYC)
+#   define ACC_OPENCL_FUNCNAME __PRETTY_FUNCTION__
+# elif defined(_MSC_VER)
+#   define ACC_OPENCL_FUNCNAME __FUNCDNAME__
+# else
+#   define ACC_OPENCL_FUNCNAME __FUNCNAME__
+# endif
+#else
+# if defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__) /*C99*/
+#   define ACC_OPENCL_FUNCNAME __func__
+# elif defined(_MSC_VER)
+#   define ACC_OPENCL_FUNCNAME __FUNCDNAME__/*__FUNCTION__*/
+# elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
+#   define ACC_OPENCL_FUNCNAME __PRETTY_FUNCTION__
+# else
+#   define ACC_OPENCL_FUNCNAME ""
+# endif
+#endif
+
+#if defined(__STDC_VERSION__) && (199901L <= __STDC_VERSION__ || defined(__GNUC__))
+# define ACC_OPENCL_SNPRINTF(S, N, ...) snprintf(S, N, __VA_ARGS__)
+#else
+# define ACC_OPENCL_SNPRINTF(S, N, ...) sprintf((S) + /*unused*/(N) * 0, __VA_ARGS__)
+#endif
+
+#if defined(_DEBUG)
+# define ACC_OPENCL_DEBUG_PRINTF(A, ...) printf(A, __VA_ARGS__)
+#else
+# define ACC_OPENCL_DEBUG_PRINTF(A, ...)
+#endif
+
+#if defined(NDEBUG)
+# define ACC_OPENCL_EXPECT(EXPECTED, EXPR) (EXPR)
+# define ACC_OPENCL_ERROR(MSG, RESULT) (RESULT) = EXIT_FAILURE
+# define ACC_OPENCL_RETURN_CAUSE(RESULT, CAUSE) ACC_OPENCL_UNUSED(CAUSE); return RESULT
+#else
+# define ACC_OPENCL_EXPECT(EXPECTED, EXPR) assert((EXPECTED) == (EXPR))
+# define ACC_OPENCL_ERROR(MSG, RESULT) do { \
+    if (-1001 != (RESULT)) { \
+      fprintf(stderr, "ERROR ACC/OpenCL: " MSG); \
+      if (EXIT_FAILURE != (RESULT)) { \
+        fprintf(stderr, " (code=%i)", RESULT); \
+      } \
+      fprintf(stderr, ".\n"); \
+      assert(CL_SUCCESS != (RESULT)); \
+    } \
+    else { \
+      fprintf(stderr, "ERROR ACC/OpenCL: incomplete installation (" MSG ").\n"); \
+    } \
+    assert(!MSG); \
+    (RESULT) = EXIT_FAILURE; \
+  } while (0)
+# define ACC_OPENCL_RETURN_CAUSE(RESULT, CAUSE) do { \
+    const int acc_opencl_return_cause_result_ = (RESULT); \
+    if (EXIT_SUCCESS != acc_opencl_return_cause_result_) { \
+      fprintf(stderr, "ERROR ACC/OpenCL: failed for %s!\n", \
+        (NULL != (CAUSE) && '\0' != *(const char*)(CAUSE)) \
+          ? ((const char*)CAUSE) \
+          : (ACC_OPENCL_FUNCNAME)); \
+      assert(!"SUCCESS"); \
+    } \
+    return acc_opencl_return_cause_result_; \
+  } while (0)
+#endif
+#define ACC_OPENCL_RETURN(RESULT) ACC_OPENCL_RETURN_CAUSE(RESULT, NULL)
+
+#define ACC_OPENCL_CHECK(EXPR, MSG, RESULT) do { \
+  if (EXIT_SUCCESS == (RESULT)) { \
+    (RESULT) = (EXPR); assert((MSG) && *(MSG)); \
+    if (CL_SUCCESS == (RESULT)) { \
+      (RESULT) = EXIT_SUCCESS; \
+    } \
+    else { \
+      ACC_OPENCL_ERROR(MSG, RESULT); \
+    } \
+  } \
+} while (0)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/** Settings depending on OpenCL vendor or standard level (discovered/setup in acc_init). */
+typedef struct acc_opencl_options_t {
+  /** Asynchronous memory operations may crash for some OpenCL implementations. */
+  cl_bool async_memops;
+  cl_bool svm_interop;
+} acc_opencl_options_t;
+
+extern acc_opencl_options_t acc_opencl_options;
+
+/* non-zero if library is initialized, zero devices is signaled by nagative value */
+extern int acc_opencl_ndevices;
+/* allow a context per each OpenMP thread */
+extern cl_context acc_opencl_context;
+#if defined(_OPENMP) && defined(ACC_OPENCL_THREADLOCAL_CONTEXT)
+# pragma omp threadprivate(acc_opencl_context)
+#endif
+
+typedef struct acc_opencl_info_hostptr_t {
+  cl_mem buffer;
+  void* mapped;
+} acc_opencl_info_hostptr_t;
+
+/** Information about host-memory pointer (acc_host_mem_allocate). */
+acc_opencl_info_hostptr_t* acc_opencl_info_hostptr(void* memory);
+/** Get host-pointer associated with device-memory (acc_dev_mem_allocate). */
+void* acc_opencl_get_hostptr(cl_mem memory);
+/** Information about amount of device memory. */
+int acc_opencl_info_devmem(cl_device_id device,
+  size_t* mem_free, size_t* mem_total);
+/** Return the pointer to the 1st match of "b" in "a", or NULL (no match). */
+const char* acc_opencl_stristr(const char* a, const char* b);
+/** Get active device (can be thread/queue-specific). */
+int acc_opencl_device(void* stream, cl_device_id* device);
+/** Confirm the vendor of the given device. */
+int acc_opencl_device_vendor(cl_device_id device, const char* vendor);
+/** Return the OpenCL support level for the given device. */
+int acc_opencl_device_level(cl_device_id device,
+  int* level_major, int* level_minor);
+/** Check if given device supports the extensions. */
+int acc_opencl_device_ext(cl_device_id device,
+  const char *const extnames[], int num_exts);
+/** Internal flavor of acc_set_active_device; yields cl_device_id. */
+int acc_opencl_set_active_device(int device_id, cl_device_id* device);
+/** Get preferred multiple and max. size of workgroup (kernel- or device-specific). */
+int acc_opencl_wgsize(cl_device_id device, cl_kernel kernel,
+  int* max_value, int* preferred_multiple);
+/** Build kernel function with given name from source using given build_options. */
+int acc_opencl_kernel(const char* source, const char* build_options,
+  const char* kernel_name, cl_kernel* kernel);
+/** Create command queue (stream). */
+int acc_opencl_stream_create(cl_command_queue* stream_p, const char* name,
+  const ACC_OPENCL_COMMAND_QUEUE_PROPERTIES* properties);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*ACC_OPENCL_H*/
diff --git a/src/acc/opencl/acc_opencl.sh b/src/acc/opencl/acc_opencl.sh
new file mode 100755
index 00000000000..d6c3ea8dabb
--- /dev/null
+++ b/src/acc/opencl/acc_opencl.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+####################################################################################################
+# Copyright (C) by the DBCSR developers group - All rights reserved                                #
+# This file is part of the DBCSR library.                                                          #
+#                                                                                                  #
+# For information on the license, see the LICENSE file.                                            #
+# For further information please visit https://dbcsr.cp2k.org                                      #
+# SPDX-License-Identifier: GPL-2.0+                                                                #
+####################################################################################################
+
+BASENAME=$(command -v basename)
+SED=$(command -v gsed)
+CPP=$(command -v cpp)
+RM=$(command -v rm)
+
+# GNU sed is desired (macOS)
+if [ "" = "${SED}" ]; then
+  SED=$(command -v sed)
+fi
+
+if [ "${BASENAME}" ] && [ "${SED}" ] && [ "${RM}" ]; then
+  for OFILE in "$@"; do :; done
+  if [ "$#" -gt 1 ]; then
+    # allow for instance /dev/stdout
+    if [ "${OFILE##*.}" = "h" ]; then
+      truncate -s0 "${OFILE}"
+    elif [ "${OFILE##*.}" = "cl" ] || [ "${OFILE##*.}" = "csv" ]; then
+      >&2 echo "ERROR: no output/header file given!"
+      exit 1
+    fi
+    NFILES_OCL=0
+    NFILES_CSV=0
+    for IFILE in "$@"; do
+      if [ "${IFILE}" != "${OFILE}" ]; then
+        if [ "${IFILE##*.}" = "cl" ]; then
+          if [ -e "${IFILE}" ]; then
+            BNAME=$(${BASENAME} "${IFILE}" .cl)
+            VNAME=opencl_libsmm_source_${BNAME}
+            MNAME=$(echo "${VNAME}" | tr '[:lower:]' '[:upper:]')
+            echo "#define ${MNAME} ${VNAME}" >>"${OFILE}"
+            echo "const char ${VNAME}[] =" >>"${OFILE}"
+            printf '  \"#pragma OPENCL EXTENSION all: enable\\n\"\n' >>"${OFILE}"
+            if [ "${CPP}" ] && \
+               [ "$(${CPP} -dD -P -fpreprocessed "${IFILE}" 2>/dev/null >/dev/null && echo "OK")" ];
+            then
+              ${CPP} -dD -P -fpreprocessed "${IFILE}"
+            else # fallback to sed
+              ${SED} -r ':a;s%(.*)/\*.*\*/%\1%;ta;/\/\*/!b;N;ba' "${IFILE}"
+            fi | \
+            ${SED} \
+              -e '/^[[:space:]]*$/d' -e 's/[[:space:]]*$//' \
+              -e 's/\\/\\\\/g' -e 's/"/\\"/g' -e 's/^/  "/' -e 's/$/\\n"/' \
+              >>"${OFILE}"
+            echo ";" >>"${OFILE}"
+            NFILES_OCL=$((NFILES_OCL+1))
+          else
+            >&2 echo "ERROR: ${IFILE} does not exist!"
+            rm -f "${OFILE}"
+            exit 1
+          fi
+        elif [ "${IFILE##*.}" = "csv" ]; then
+          # non-existence does not trigger an error
+          if [ -e "${IFILE}" ]; then
+            VNAME=opencl_libsmm_params_smm
+            MNAME=$(echo "${VNAME}" | tr '[:lower:]' '[:upper:]')
+            echo "#define ${MNAME} ${VNAME}" >>"${OFILE}"
+            echo "const char ${VNAME}[] =" >>"${OFILE}"
+            ${SED} 's/^/  "/;s/$/\\n"/;1d' "${IFILE}" >>"${OFILE}"
+            echo ";" >>"${OFILE}"
+            NFILES_CSV=$((NFILES_CSV+1))
+          fi
+        else
+          >&2 echo "ERROR: ${IFILE} is not an OpenCL or CSV file!"
+          rm -f "${OFILE}"
+          exit 1
+        fi
+      fi
+    done
+    if [ "0" = "${NFILES_OCL}" ]; then
+      >&2 echo "ERROR: no OpenCL file was given!"
+      rm -f "${OFILE}"
+      exit 1
+    elif [ "0" != "$((1<NFILES_CSV))" ]; then
+      >&2 echo "ERROR: more than one CSV file was given!"
+      rm -f "${OFILE}"
+      exit 1
+    fi
+  else
+    echo "Usage: $0 infile.cl [infile2.cl .. infileN.cl] [infile.csv] outfile.h"
+    echo "       At least one OpenCL file must be supplied."
+    echo "       Parameters per CSV file are optional."
+    echo "       The CSV file can be at any position."
+  fi
+else
+  >&2 echo "ERROR: missing prerequisites!"
+  exit 1
+fi
diff --git a/src/acc/opencl/acc_opencl_event.c b/src/acc/opencl/acc_opencl_event.c
new file mode 100644
index 00000000000..635430ebb52
--- /dev/null
+++ b/src/acc/opencl/acc_opencl_event.c
@@ -0,0 +1,133 @@
+/*------------------------------------------------------------------------------------------------*
+ * Copyright (C) by the DBCSR developers group - All rights reserved                              *
+ * This file is part of the DBCSR library.                                                        *
+ *                                                                                                *
+ * For information on the license, see the LICENSE file.                                          *
+ * For further information please visit https://dbcsr.cp2k.org                                    *
+ * SPDX-License-Identifier: GPL-2.0+                                                              *
+ *------------------------------------------------------------------------------------------------*/
+#if defined(__OPENCL)
+#include "acc_opencl.h"
+#include <stdlib.h>
+#include <assert.h>
+
+#if defined(CL_VERSION_1_2)
+# if defined(ACC_OPENCL_EVENT_BARRIER)
+#   define ACC_OPENCL_ENQUEUE_EVENT(QUEUE, EVENT) clEnqueueBarrierWithWaitList(QUEUE, 0, NULL, EVENT)
+# else
+#   define ACC_OPENCL_ENQUEUE_EVENT(QUEUE, EVENT) clEnqueueMarkerWithWaitList(QUEUE, 0, NULL, EVENT)
+# endif
+#else
+# define ACC_OPENCL_ENQUEUE_EVENT(QUEUE, EVENT) clEnqueueMarker(QUEUE, EVENT)
+#endif
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int acc_event_create(void** event_p)
+{
+  cl_int result = EXIT_SUCCESS;
+  const cl_event event = clCreateUserEvent(acc_opencl_context, &result);
+  assert(NULL != event_p);
+  if (NULL != event) {
+    cl_int status = CL_COMPLETE;
+    assert(CL_SUCCESS == result);
+    /* an empty event (unrecorded) has no work to wait for; hence it is
+     * considered occurred and acc_event_synchronize must not block
+     */
+    if (CL_SUCCESS == clSetUserEventStatus(event, status)) {
+#if defined(ACC_OPENCL_EVENT_NOALLOC)
+      assert(sizeof(void*) >= sizeof(cl_event));
+      *event_p = (void*)event;
+#else
+      *event_p = malloc(sizeof(cl_event));
+      if (NULL != *event_p) {
+        *(cl_event*)*event_p = event;
+        result = EXIT_SUCCESS;
+      }
+      else {
+        clReleaseEvent(event);
+        result = EXIT_FAILURE;
+      }
+#endif
+    }
+    else {
+      ACC_OPENCL_ERROR("set initial event state", result);
+      clReleaseEvent(event);
+      *event_p = NULL;
+    }
+  }
+  else {
+    assert(CL_SUCCESS != result);
+    ACC_OPENCL_ERROR("create user-defined event", result);
+    *event_p = NULL;
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_event_destroy(void* event)
+{
+  int result = EXIT_SUCCESS;
+  if (NULL != event) {
+    ACC_OPENCL_CHECK(clReleaseEvent(*ACC_OPENCL_EVENT(event)),
+      "release user-defined event", result);
+#if defined(ACC_OPENCL_EVENT_NOALLOC)
+    assert(sizeof(void*) >= sizeof(cl_event));
+#else
+    free(event);
+#endif
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_event_record(void* event, void* stream)
+{
+  int result = EXIT_SUCCESS;
+  assert(NULL != event && NULL != stream);
+  ACC_OPENCL_CHECK(ACC_OPENCL_ENQUEUE_EVENT(*ACC_OPENCL_STREAM(stream), ACC_OPENCL_EVENT(event)),
+    "record event", result);
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_event_query(void* event, acc_bool_t* has_occurred)
+{
+  int result = EXIT_SUCCESS;
+  cl_int status = CL_COMPLETE;
+  if (NULL != event) {
+#if defined(ACC_OPENCL_STREAM_SYNCFLUSH)
+    ACC_OPENCL_CHECK(clFlush(*ACC_OPENCL_STREAM(stream)), "flush stream", result);
+#endif
+    ACC_OPENCL_CHECK(clGetEventInfo(*ACC_OPENCL_EVENT(event), CL_EVENT_COMMAND_EXECUTION_STATUS,
+      sizeof(cl_int), &status, NULL), "retrieve event status", result);
+  }
+  assert(NULL != has_occurred);
+  *has_occurred = (CL_COMPLETE == status || 0 > status);
+#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG)
+  fprintf(stderr, "acc_event_query(%p, %i)\n", event, *has_occurred);
+#endif
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_event_synchronize(void* event)
+{ /* Waits on the host-side. */
+  int result = EXIT_SUCCESS;
+  assert(NULL != event);
+#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG)
+  fprintf(stderr, "acc_event_synchronize(%p)\n", event);
+#endif
+  ACC_OPENCL_CHECK(clWaitForEvents(1, ACC_OPENCL_EVENT(event)),
+    "synchronize event", result);
+  ACC_OPENCL_RETURN(result);
+}
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*__OPENCL*/
diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c
new file mode 100644
index 00000000000..ea4d89e7d8a
--- /dev/null
+++ b/src/acc/opencl/acc_opencl_mem.c
@@ -0,0 +1,369 @@
+/*------------------------------------------------------------------------------------------------*
+ * Copyright (C) by the DBCSR developers group - All rights reserved                              *
+ * This file is part of the DBCSR library.                                                        *
+ *                                                                                                *
+ * For information on the license, see the LICENSE file.                                          *
+ * For further information please visit https://dbcsr.cp2k.org                                    *
+ * SPDX-License-Identifier: GPL-2.0+                                                              *
+ *------------------------------------------------------------------------------------------------*/
+#if defined(__OPENCL)
+#include "acc_opencl.h"
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+#if defined(_WIN32)
+# include <Windows.h>
+#else
+# if !defined(__linux__)
+#   include <sys/types.h>
+#   include <sys/sysctl.h>
+# endif
+# include <unistd.h>
+#endif
+
+#if !defined(ACC_OPENCL_MEM_MAPMULTI) && 0
+# define ACC_OPENCL_MEM_MAPMULTI
+#endif
+#if !defined(ACC_OPENCL_MEM_ALIGNSCALE)
+# define ACC_OPENCL_MEM_ALIGNSCALE 8
+#endif
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int acc_opencl_memalignment(size_t /*size*/);
+int acc_opencl_memalignment(size_t size)
+{
+  int result;
+  if ((ACC_OPENCL_MEM_ALIGNSCALE * ACC_OPENCL_MAXALIGN_NBYTES) <= size) {
+    result = ACC_OPENCL_MAXALIGN_NBYTES;
+  }
+  else if ((ACC_OPENCL_MEM_ALIGNSCALE * ACC_OPENCL_CACHELINE_NBYTES) <= size) {
+    result = ACC_OPENCL_CACHELINE_NBYTES;
+  }
+  else {
+    result = sizeof(void*);
+  }
+  return result;
+}
+
+
+acc_opencl_info_hostptr_t* acc_opencl_info_hostptr(void* memory)
+{
+  assert(NULL == memory || sizeof(acc_opencl_info_hostptr_t) <= (uintptr_t)memory);
+  return (NULL != memory
+    ? (acc_opencl_info_hostptr_t*)((uintptr_t)memory - sizeof(acc_opencl_info_hostptr_t))
+    : (acc_opencl_info_hostptr_t*)NULL);
+}
+
+
+void* acc_opencl_get_hostptr(cl_mem memory)
+{
+  void* result = NULL;
+  assert(acc_opencl_options.svm_interop);
+  if (NULL != memory && CL_SUCCESS != clGetMemObjectInfo(memory, CL_MEM_HOST_PTR, sizeof(void*), &result, NULL)) {
+    assert(NULL == result);
+  }
+  return result;
+}
+
+
+int acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream)
+{
+  cl_int result;
+  const int alignment = acc_opencl_memalignment(nbytes);
+  const size_t size_meminfo = sizeof(acc_opencl_info_hostptr_t);
+  const size_t size = nbytes + alignment + size_meminfo - 1;
+  const cl_mem buffer = (
+#if defined(ACC_OPENCL_SVM)
+    acc_opencl_options.svm_interop ? clCreateBuffer(acc_opencl_context, CL_MEM_USE_HOST_PTR, size,
+      clSVMAlloc(acc_opencl_context, CL_MEM_READ_WRITE, size, sizeof(void*)/*minimal alignment*/), &result) :
+#endif
+    clCreateBuffer(acc_opencl_context, CL_MEM_ALLOC_HOST_PTR, size, NULL/*host_ptr*/, &result));
+  assert(NULL != host_mem && NULL != stream);
+  if (NULL != buffer) {
+    const cl_command_queue queue = *ACC_OPENCL_STREAM(stream);
+    const uintptr_t address = (uintptr_t)clEnqueueMapBuffer(queue, buffer,
+      !acc_opencl_options.async_memops, CL_MAP_READ | CL_MAP_WRITE,
+      0/*offset*/, size, 0, NULL, NULL, &result);
+    if (0 != address) {
+      const uintptr_t aligned = ACC_OPENCL_UP2(address + size_meminfo, alignment);
+      acc_opencl_info_hostptr_t* meminfo;
+      assert(address + size_meminfo <= aligned);
+      assert(CL_SUCCESS == result);
+#if defined(ACC_OPENCL_MEM_MAPMULTI)
+      assert(0 < aligned - address - size_meminfo);
+      meminfo = (acc_opencl_info_hostptr_t*)clEnqueueMapBuffer(queue, buffer,
+        CL_TRUE/*blocking*/, CL_MAP_READ | CL_MAP_WRITE,
+        aligned - address - size_meminfo, size_meminfo, 0, NULL, NULL, &result);
+#else
+      meminfo = (acc_opencl_info_hostptr_t*)(aligned - size_meminfo);
+#endif
+      if (NULL != meminfo) {
+        meminfo->buffer = buffer;
+        meminfo->mapped = (void*)address;
+        *host_mem = (void*)aligned;
+      }
+      else {
+        ACC_OPENCL_ERROR("map buffer info", result);
+        *host_mem = NULL;
+      }
+    }
+    else {
+      assert(CL_SUCCESS != result);
+      ACC_OPENCL_ERROR("map host buffer", result);
+      *host_mem = NULL;
+    }
+  }
+  else {
+    assert(CL_SUCCESS != result);
+    ACC_OPENCL_ERROR("create host buffer", result);
+    *host_mem = NULL;
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_host_mem_deallocate(void* host_mem, void* stream)
+{
+  int result = EXIT_SUCCESS;
+  assert(NULL != stream);
+  if (NULL != host_mem) {
+    acc_opencl_info_hostptr_t *const meminfo = acc_opencl_info_hostptr(host_mem);
+    const acc_opencl_info_hostptr_t info = *meminfo; /* copy meminfo prior to unmap */
+    const cl_command_queue queue = *ACC_OPENCL_STREAM(stream);
+    if (NULL != meminfo->buffer) {
+#if defined(ACC_OPENCL_MEM_MAPMULTI)
+      ACC_OPENCL_CHECK(clEnqueueUnmapMemObject(queue, meminfo->buffer, meminfo,
+        0, NULL, NULL), "unmap memory info", result);
+#endif
+      ACC_OPENCL_CHECK(clEnqueueUnmapMemObject(queue, info.buffer, info.mapped,
+        0, NULL, NULL), "unmap host memory", result);
+      ACC_OPENCL_CHECK(clReleaseMemObject(info.buffer),
+        "release host memory buffer", result);
+#if defined(ACC_OPENCL_SVM)
+      if (acc_opencl_options.svm_interop) clSVMFree(acc_opencl_context, info.mapped);
+#endif
+    }
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_dev_mem_allocate(void** dev_mem, size_t nbytes)
+{
+  cl_int result;
+  const cl_mem buffer = (
+#if defined(ACC_OPENCL_SVM)
+    acc_opencl_options.svm_interop ? clCreateBuffer(acc_opencl_context, CL_MEM_USE_HOST_PTR, nbytes,
+      clSVMAlloc(acc_opencl_context, CL_MEM_READ_WRITE, nbytes, 0/*default alignment*/), &result) :
+#endif
+    clCreateBuffer(acc_opencl_context, CL_MEM_READ_WRITE, nbytes, NULL/*host_ptr*/, &result));
+  assert(NULL != dev_mem);
+  if (NULL != buffer) {
+#if defined(ACC_OPENCL_MEM_NOALLOC)
+    assert(sizeof(void*) >= sizeof(cl_mem));
+    *dev_mem = (void*)buffer;
+#else
+    *dev_mem = malloc(sizeof(cl_mem));
+    if (NULL != *dev_mem) {
+      *(cl_mem*)*dev_mem = buffer;
+      result = EXIT_SUCCESS;
+    }
+    else {
+#if defined(ACC_OPENCL_SVM)
+      void *const ptr = (acc_opencl_options.svm_interop
+        ? acc_opencl_get_hostptr(buffer) : NULL);
+#endif
+      clReleaseMemObject(buffer);
+#if defined(ACC_OPENCL_SVM)
+      /*if (NULL != ptr)*/ clSVMFree(acc_opencl_context, ptr);
+#endif
+      result = EXIT_FAILURE;
+    }
+#endif
+  }
+  else {
+    assert(CL_SUCCESS != result);
+    ACC_OPENCL_ERROR("create device buffer", result);
+    *dev_mem = NULL;
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_dev_mem_deallocate(void* dev_mem)
+{
+  int result = EXIT_SUCCESS;
+  if (NULL != dev_mem) {
+    const cl_mem buffer = *ACC_OPENCL_MEM(dev_mem);
+#if defined(ACC_OPENCL_SVM)
+    void *const ptr = (acc_opencl_options.svm_interop
+      ? acc_opencl_get_hostptr(buffer) : NULL);
+#endif
+    ACC_OPENCL_CHECK(clReleaseMemObject(buffer),
+      "release device memory buffer", result);
+#if defined(ACC_OPENCL_MEM_NOALLOC)
+    assert(sizeof(void*) >= sizeof(cl_mem));
+#else
+    free(dev_mem);
+#endif
+#if defined(ACC_OPENCL_SVM)
+    /*if (NULL != ptr)*/ clSVMFree(acc_opencl_context, ptr);
+#endif
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_dev_mem_set_ptr(void** dev_mem, void* other, size_t lb)
+{
+  int result;
+  assert(NULL != dev_mem);
+  if (NULL != other || 0 == lb) {
+    *dev_mem = (char*)other + lb;
+    result = EXIT_SUCCESS;
+  }
+  else result = EXIT_FAILURE;
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, void* stream)
+{
+  int result = EXIT_SUCCESS;
+  assert((NULL != host_mem || 0 == nbytes) && (NULL != dev_mem || 0 == nbytes) && NULL != stream);
+  if (NULL != host_mem && NULL != dev_mem && 0 != nbytes) {
+    ACC_OPENCL_CHECK(clEnqueueWriteBuffer(*ACC_OPENCL_STREAM(stream), *ACC_OPENCL_MEM(dev_mem),
+      !acc_opencl_options.async_memops, 0/*offset*/, nbytes, host_mem, 0, NULL, NULL),
+      "enqueue h2d copy", result);
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, void* stream)
+{
+  int result = EXIT_SUCCESS;
+  assert((NULL != dev_mem || 0 == nbytes) && (NULL != host_mem || 0 == nbytes) && NULL != stream);
+  if (NULL != host_mem && NULL != dev_mem && 0 != nbytes) {
+    ACC_OPENCL_CHECK(clEnqueueReadBuffer(*ACC_OPENCL_STREAM(stream), *ACC_OPENCL_MEM(dev_mem),
+      !acc_opencl_options.async_memops, 0/*offset*/, nbytes, host_mem, 0, NULL, NULL),
+      "enqueue d2h copy", result);
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbytes, void* stream)
+{
+  int result = EXIT_SUCCESS;
+  assert((NULL != devmem_src || 0 == nbytes) && (NULL != devmem_dst || 0 == nbytes) && NULL != stream);
+  if (NULL != devmem_src && NULL != devmem_dst && 0 != nbytes) {
+    ACC_OPENCL_CHECK(clEnqueueCopyBuffer(*ACC_OPENCL_STREAM(stream),
+      *ACC_OPENCL_MEM(devmem_src), *ACC_OPENCL_MEM(devmem_dst),
+      0/*src_offset*/, 0/*dst_offset*/, nbytes, 0, NULL, NULL),
+      "enqueue d2d copy", result);
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_memset_zero(void* dev_mem, size_t offset, size_t nbytes, void* stream)
+{
+  int result = EXIT_SUCCESS;
+  assert((NULL != dev_mem || 0 == nbytes) && NULL != stream);
+  if (0 != nbytes) {
+    const cl_uchar pattern = 0; /* fill with zeros */
+    ACC_OPENCL_CHECK(clEnqueueFillBuffer(*ACC_OPENCL_STREAM(stream), *ACC_OPENCL_MEM(dev_mem),
+      &pattern, sizeof(pattern), offset, nbytes, 0, NULL, NULL),
+      "enqueue zeroing kernel", result);
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_opencl_info_devmem(cl_device_id device, size_t* mem_free, size_t* mem_total)
+{
+  int result = EXIT_SUCCESS;
+  size_t size_free = 0, size_total = 0;
+#if defined(_WIN32)
+  MEMORYSTATUSEX mem_status;
+  mem_status.dwLength = sizeof(mem_status);
+  if (GlobalMemoryStatusEx(&mem_status)) {
+    size_total = (size_t)mem_status.ullTotalPhys;
+    size_free  = (size_t)mem_status.ullAvailPhys;
+  }
+#else
+# if defined(_SC_PAGE_SIZE)
+  const long page_size = sysconf(_SC_PAGE_SIZE);
+# else
+  const long page_size = 4096;
+# endif
+# if defined(__linux__)
+#   if defined(_SC_PHYS_PAGES)
+  const long pages_total = sysconf(_SC_PHYS_PAGES);
+#   else
+  const long pages_total = 0;
+#   endif
+#   if defined(_SC_AVPHYS_PAGES)
+  const long pages_free = sysconf(_SC_AVPHYS_PAGES);
+#   else
+  const long pages_free = pages_total;
+#   endif
+# else
+  /*const*/ size_t size_pages_free = sizeof(const long), size_pages_total = sizeof(const long);
+  long pages_free = 0, pages_total = 0;
+  ACC_OPENCL_EXPECT(0, sysctlbyname("hw.memsize", &pages_total, &size_pages_total, NULL, 0));
+  if (0 < page_size) pages_total /= page_size;
+  if (0 != sysctlbyname("vm.page_free_count", &pages_free, &size_pages_free, NULL, 0)) {
+    pages_free = pages_total;
+  }
+# endif
+  if (0 < page_size && 0 <= pages_free && 0 <= pages_total) {
+    const size_t size_page = (size_t)page_size;
+    size_total = size_page * (size_t)pages_total;
+    size_free  = size_page * (size_t)pages_free;
+  }
+#endif
+  if (NULL != device) {
+    cl_ulong cl_size_total = 0;
+    ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE,
+      sizeof(cl_ulong), &cl_size_total, NULL), "retrieve amount of device memory", result);
+    assert(0 < acc_opencl_ndevices);
+    size_total /= acc_opencl_ndevices;
+    size_free  /= acc_opencl_ndevices;
+    if (EXIT_SUCCESS == result) {
+      if (cl_size_total < size_total) size_total = cl_size_total;
+      if (size_total < size_free) size_free = size_total;
+    }
+  }
+  result = (size_free <= size_total ? EXIT_SUCCESS : EXIT_FAILURE);
+  assert(NULL != mem_free || NULL != mem_total);
+  if (NULL != mem_total) *mem_total = size_total;
+  if (NULL != mem_free)  *mem_free  = size_free;
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_dev_mem_info(size_t* mem_free, size_t* mem_total)
+{
+  int result = EXIT_SUCCESS;
+  cl_device_id active_id = NULL;
+  if (NULL != acc_opencl_context) {
+    result = acc_opencl_device(NULL/*stream*/, &active_id);
+  }
+  if (EXIT_SUCCESS == result) {
+    result = acc_opencl_info_devmem(active_id, mem_free, mem_total);
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*__OPENCL*/
diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c
new file mode 100644
index 00000000000..8403eee79bb
--- /dev/null
+++ b/src/acc/opencl/acc_opencl_stream.c
@@ -0,0 +1,196 @@
+/*------------------------------------------------------------------------------------------------*
+ * Copyright (C) by the DBCSR developers group - All rights reserved                              *
+ * This file is part of the DBCSR library.                                                        *
+ *                                                                                                *
+ * For information on the license, see the LICENSE file.                                          *
+ * For further information please visit https://dbcsr.cp2k.org                                    *
+ * SPDX-License-Identifier: GPL-2.0+                                                              *
+ *------------------------------------------------------------------------------------------------*/
+#if defined(__OPENCL)
+#include "acc_opencl.h"
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#if defined(CL_VERSION_2_0)
+# define ACC_OPENCL_CREATE_COMMAND_QUEUE(CTX, DEV, PROPS, RESULT) \
+    clCreateCommandQueueWithProperties(CTX, DEV, PROPS, RESULT)
+#else
+# define ACC_OPENCL_CREATE_COMMAND_QUEUE(CTX, DEV, PROPS, RESULT) \
+    clCreateCommandQueue(CTX, DEV, /* avoid warning about unused argument */ \
+      (cl_command_queue_properties)(0 & (NULL != (PROPS) ? (((cl_int*)(PROPS))[0]) : 0)), RESULT)
+#endif
+
+#if defined(CL_VERSION_1_2)
+# if defined(ACC_OPENCL_EVENT_BARRIER)
+#   define ACC_OPENCL_WAIT_EVENT(QUEUE, EVENT) clEnqueueBarrierWithWaitList(QUEUE, 1, EVENT, NULL)
+# else
+#   define ACC_OPENCL_WAIT_EVENT(QUEUE, EVENT) clEnqueueMarkerWithWaitList(QUEUE, 1, EVENT, NULL)
+# endif
+#else
+# define ACC_OPENCL_WAIT_EVENT(QUEUE, EVENT) clEnqueueWaitForEvents(QUEUE, 1, EVENT)
+#endif
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int acc_opencl_stream_create(cl_command_queue* stream_p, const char* name,
+  const ACC_OPENCL_COMMAND_QUEUE_PROPERTIES* properties)
+{
+  cl_int result = EXIT_SUCCESS;
+  assert(NULL != stream_p);
+  if (NULL != acc_opencl_context) {
+    cl_device_id device_id = NULL;
+    result = acc_opencl_device(NULL/*stream*/, &device_id);
+    if (EXIT_SUCCESS == result) {
+      *stream_p = ACC_OPENCL_CREATE_COMMAND_QUEUE(acc_opencl_context, device_id, properties, &result);
+    }
+    else {
+      ACC_OPENCL_ERROR("create command queue", result);
+    }
+  }
+  ACC_OPENCL_RETURN_CAUSE(result, name);
+}
+
+
+int acc_stream_create(void** stream_p, const char* name, int priority)
+{
+  cl_int result = EXIT_SUCCESS;
+  if (NULL != acc_opencl_context) {
+    cl_command_queue queue = NULL;
+#if !defined(ACC_OPENCL_STREAM_PRIORITIES) || !defined(CL_QUEUE_PRIORITY_KHR)
+    ACC_OPENCL_UNUSED(priority);
+#else
+    if (0 <= priority) {
+      ACC_OPENCL_COMMAND_QUEUE_PROPERTIES properties[] = {
+        CL_QUEUE_PRIORITY_KHR, 0/*placeholder filled-in below*/,
+        0 /* terminator */
+      };
+      properties[1] = (CL_QUEUE_PRIORITY_HIGH_KHR <= priority && CL_QUEUE_PRIORITY_LOW_KHR >= priority)
+        ? priority : ((CL_QUEUE_PRIORITY_HIGH_KHR + CL_QUEUE_PRIORITY_LOW_KHR) / 2);
+      result = acc_opencl_stream_create(&queue, name, properties);
+    }
+    else
+#endif
+    {
+      ACC_OPENCL_COMMAND_QUEUE_PROPERTIES properties[] = {
+        0 /* terminator */
+      };
+      result = acc_opencl_stream_create(&queue, name, properties);
+    }
+    assert(NULL != stream_p);
+    if (EXIT_SUCCESS == result) {
+      assert(NULL != queue);
+#if defined(ACC_OPENCL_STREAM_NOALLOC)
+      assert(sizeof(void*) >= sizeof(cl_command_queue));
+      *stream_p = (void*)queue;
+#else
+      *stream_p = malloc(sizeof(cl_command_queue));
+      if (NULL != *stream_p) {
+        *(cl_command_queue*)*stream_p = queue;
+      }
+      else {
+        clReleaseCommandQueue(queue);
+        result = EXIT_FAILURE;
+      }
+#endif
+    }
+    else {
+      *stream_p = NULL;
+    }
+  }
+  ACC_OPENCL_RETURN_CAUSE(result, name);
+}
+
+
+int acc_stream_destroy(void* stream)
+{
+  int result = EXIT_SUCCESS;
+  if (NULL != stream) {
+    ACC_OPENCL_CHECK(clReleaseCommandQueue(*ACC_OPENCL_STREAM(stream)),
+      "release command queue", result);
+#if defined(ACC_OPENCL_STREAM_NOALLOC)
+    assert(sizeof(void*) >= sizeof(cl_command_queue));
+#else
+    free(stream);
+#endif
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_stream_priority_range(int* least, int* greatest)
+{
+  int result = ((NULL != least || NULL != greatest) ? EXIT_SUCCESS : EXIT_FAILURE);
+  if (NULL != acc_opencl_context) {
+#if defined(ACC_OPENCL_STREAM_PRIORITIES) && defined(CL_QUEUE_PRIORITY_KHR)
+    char buffer[ACC_OPENCL_BUFFERSIZE];
+    cl_platform_id platform = NULL;
+    cl_device_id active_id = NULL;
+    assert(0 < acc_opencl_ndevices);
+    if (EXIT_SUCCESS == result) result = acc_opencl_device(NULL/*stream*/, &active_id);
+    ACC_OPENCL_CHECK(clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM,
+      sizeof(cl_platform_id), &platform, NULL),
+      "retrieve platform associated with active device", result);
+    ACC_OPENCL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS,
+      ACC_OPENCL_BUFFERSIZE, buffer, NULL),
+      "retrieve platform extensions", result);
+    if (EXIT_SUCCESS == result) {
+      if (NULL != strstr(buffer, "cl_khr_priority_hints")) {
+        if (NULL != least) *least = CL_QUEUE_PRIORITY_LOW_KHR;
+        if (NULL != greatest) *greatest = CL_QUEUE_PRIORITY_HIGH_KHR;
+      }
+      else
+#endif
+      {
+        if (NULL != least) *least = -1;
+        if (NULL != greatest) *greatest = -1;
+      }
+#if defined(ACC_OPENCL_STREAM_PRIORITIES) && defined(CL_QUEUE_PRIORITY_KHR)
+    }
+#endif
+  }
+  else {
+    if (NULL != least) *least = -1;
+    if (NULL != greatest) *greatest = -1;
+  }
+  assert(least != greatest); /* no alias */
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_stream_sync(void* stream)
+{ /* Blocks the host-thread. */
+  int result = EXIT_SUCCESS;
+  assert(NULL != stream);
+#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG)
+  fprintf(stderr, "acc_stream_sync(%p)\n", stream);
+#endif
+  ACC_OPENCL_CHECK(clFinish(*ACC_OPENCL_STREAM(stream)),
+    "synchronize stream", result);
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int acc_stream_wait_event(void* stream, void* event)
+{ /* Wait for an event (device-side). */
+  int result = EXIT_SUCCESS;
+  assert(NULL != stream && NULL != event);
+#if defined(ACC_OPENCL_VERBOSE) && defined(_DEBUG)
+  fprintf(stderr, "acc_stream_wait_event(%p, %p)\n", stream, event);
+#endif
+#if defined(ACC_OPENCL_STREAM_SYNCFLUSH)
+  ACC_OPENCL_CHECK(clFlush(*ACC_OPENCL_STREAM(stream)), "flush stream", result);
+#endif
+  ACC_OPENCL_CHECK(ACC_OPENCL_WAIT_EVENT(*ACC_OPENCL_STREAM(stream), ACC_OPENCL_EVENT(event)),
+    "wait for an event", result);
+  ACC_OPENCL_RETURN(result);
+}
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*__OPENCL*/
diff --git a/src/acc/opencl/smm/.gitignore b/src/acc/opencl/smm/.gitignore
new file mode 100644
index 00000000000..dc6a8d940a0
--- /dev/null
+++ b/src/acc/opencl/smm/.gitignore
@@ -0,0 +1 @@
+opencl_kernels.h
diff --git a/src/acc/opencl/smm/CMakeLists.txt b/src/acc/opencl/smm/CMakeLists.txt
new file mode 100644
index 00000000000..37539399ba7
--- /dev/null
+++ b/src/acc/opencl/smm/CMakeLists.txt
@@ -0,0 +1,30 @@
+set(LIBSMM_ACC_SRC_FILES opencl_libsmm.c)
+
+set(LIBSMM_ACC_HEADER_KERNELS ${CMAKE_CURRENT_SOURCE_DIR}/opencl_kernels.h)
+set(LIBSMM_ACC_HEADER_FILES opencl_libsmm.h ${LIBSMM_ACC_HEADER_KERNELS})
+set(LIBSMM_ACC_FILES ${LIBSMM_ACC_SRC_FILES} ${LIBSMM_ACC_HEADER_FILES})
+
+set(SMM_ACC_KERNEL_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/../acc_opencl.sh)
+set(SMM_ACC_KERNELS kernels/multiply.cl kernels/transpose.cl)
+list(TRANSFORM SMM_ACC_KERNELS PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/)
+
+if (USE_ACCEL MATCHES "opencl")
+  add_custom_command(
+    COMMAND
+      ${SMM_ACC_KERNEL_SCRIPT} ${SMM_ACC_KERNELS}
+      # parameter file is optional (does not need to exist)
+      ${CMAKE_CURRENT_SOURCE_DIR}/tune_multiply.csv ${LIBSMM_ACC_HEADER_KERNELS}
+    # param file is missing here (manual/clean rebuild needed)
+    DEPENDS ${SMM_ACC_KERNEL_SCRIPT} ${SMM_ACC_KERNELS}
+    OUTPUT ${LIBSMM_ACC_HEADER_KERNELS}
+    COMMENT "libsmm_acc: generating kernels")
+  add_library(libsmm_acc OBJECT ${LIBSMM_ACC_FILES})
+  target_compile_definitions(libsmm_acc PRIVATE __OPENCL)
+  target_compile_definitions(libsmm_acc PRIVATE __LIBXSMM)
+  # account for DBCSR not calling libsmm_acc_init() (DBCSR only calls acc_init)
+  target_compile_definitions(libsmm_acc PRIVATE __DBCSR_ACC)
+  target_include_directories(libsmm_acc PRIVATE ${LIBXSMM_INCLUDE_DIRS})
+  if (OpenMP_FOUND)
+    target_link_libraries(libsmm_acc PRIVATE OpenMP::OpenMP_C)
+  endif ()
+endif ()
diff --git a/src/acc/opencl/smm/PACKAGE b/src/acc/opencl/smm/PACKAGE
new file mode 100644
index 00000000000..cbd0135e530
--- /dev/null
+++ b/src/acc/opencl/smm/PACKAGE
@@ -0,0 +1,5 @@
+{
+"description": "OpenCL-accelerated library for small matrix multiplications",
+"archive": "libdbcsr",
+"requires": ["..", "../.."]
+}
diff --git a/src/acc/opencl/smm/README.md b/src/acc/opencl/smm/README.md
new file mode 100644
index 00000000000..1536bd14c04
--- /dev/null
+++ b/src/acc/opencl/smm/README.md
@@ -0,0 +1,117 @@
+# LIBSMM (OpenCL)
+
+## Overview
+
+The LIBSMM library implements the [ACC LIBSMM interface](https://github.com/cp2k/dbcsr/blob/develop/src/acc/acc_libsmm.h), and depends on the [OpenCL backend](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/README.md). At least the compile-time settings below are typically for development, e.g., when attempting to contribute new functionality or features, or meant for debug purpose (and not necessarily settings to be made when using DBCSR or CP2K).
+
+## Customization
+
+### Compile-time Settings
+
+Compile-time settings are (implicitly) documented and can be adjusted by editing [opencl_libsmm.h](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/opencl_libsmm.h) (adjusting the build-line as per `-D` is possible as well but less convenient). For example, `OPENCL_LIBSMM_F32` is enabled by default but can be disabled, or `OPENCL_LIBSMM_DEBUG` (which is disabled by default) can be enabled for debug purpose.
+
+The `OPENCL_LIBSMM_DEBUG` compile-time setting enables side-by-side validation of matrix transpose and multiply operations on GPU against a built-in CPU implementation. For example, running DBCSR's unit tests with this setting produces useful console output that allows to pin-point the exact call arguments causing a validation error.
+
+### Runtime Settings
+
+Runtime settings are made by the means of environment variables (implemented in `opencl_libsmm.c`). There are two categories (for the two major functions) like matrix transpose (`OPENCL_LIBSMM_TRANS_*`) and matrix multiplication (`OPENCL_LIBSMM_SMM_*`). For tranposing matrices:
+
+* `OPENCL_LIBSMM_TRANS_BUILDOPTS`: character string with build options (compile and link) supplied to the OpenCL runtime compiler.
+* `OPENCL_LIBSMM_TRANS_INPLACE`: Boolean value (zero or non-zero integer) for inplace matrix transpose not relying on local memory.
+* `OPENCL_LIBSMM_TRANS_BLOCK_M`: non-negative integer number (less/equal than the M-extent) denoting the blocksize in M-direction.
+
+For multiplying matrices:
+
+* `OPENCL_LIBSMM_SMM_BUILDOPTS`: character string with build options (compile and link) supplied to the OpenCL runtime compiler.
+* `OPENCL_LIBSMM_SMM_ATOMICS`: selects the kind of atomic operation used for global memory updates ("cmpxchg", "xchg"), or disables atomic updates ("0"). The latter is to quantify the impact of atomic operations rather than for achieving correct results.
+* `OPENCL_LIBSMM_SMM_BATCHSIZE`: non-negative integer number denoting the intr-kernel (mini-)batchsize mainly used to amortize atomic updates of data in global/main memory. The remainder with respect to the "stacksize" is handled by the kernel.
+* `OPENCL_LIBSMM_SMM_BLOCK_M`: non-negative integer number (less/equal than the M-extent) denoting the blocksize in M-direction.
+* `OPENCL_LIBSMM_SMM_BLOCK_N`: non-negative integer number (less/equal than the N-extent) denoting the blocksize in N-direction.
+
+**NOTE**: above runtime settings may be non-smooth in the sense of enabling a distinct code-path depending on a specific value, e.g., `OPENCL_LIBSMM_SMM_BATCHSIZE=1`.
+
+## Auto Tuning
+
+Auto tuning code for performance is a practical way to find the "best" setting for parameterized code (e.g., GPU kernels). Introducing effective parameters is a prerequisite, and exploring the (potentially) high-dimensional parameter space in an efficient way is an art. It is desirable to have reasonable defaults even without auto-tuning the parameters. It would be even better to avoid auto-tuning if best performance was possible right away, i.e., if auto-tuning is not able to find better settings.
+
+For the OpenCL based LIBSMM, `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` are explored using [OpenTuner](http://opentuner.org/). The script [tune_multiply.py](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.py) leverages for instance the [acc_bench_smm](index.html) benchmark by parsing console output (timing, data type, etc.). This way, the tuning is implemented without being intermingled with subject being tuned. To build the benchmarks:
+
+```bash
+cd src/acc/opencl
+make DBG=0
+```
+
+To auto-tune, please install the Python `wheel` and `opentuner` packages:
+
+```bash
+cd src/acc/opencl/smm
+pip install -r requirements.txt
+```
+
+The OpenTuner script supports several command line arguments (`tune_multiply.py --help`); defaults are reasonable with `--stop-after` of interest for adjustment, e.g., `--stop-after=300` to finish in five minutes (without limit, OpenTuner decides when the process is finished). A single kernel can be selected by M, N, and K parameters (GEMM), e.g., `M=15`, `N=5`, and `K=7`:
+
+```bash
+./tune_multiply.py 13 5 7
+```
+
+**NOTE**: If multiple different kernels are tuned using `tune_multiply.py`, it is advisible to delete the `opentuner.db` directory prior to a new kernel otherwise auto-tuning is potentially (mis-)guided by information which was collected for a different kernel (`tune_multiply.sh` does this automatically).
+
+The OpenTuner script implements multiple objectives ("cost"), primarily "accuracy" (maximized) and a secondary objective "size" (minimized). The former represents the achieved performance (GFLOPS/s) while the latter represents an artificial kernel requirement (just to prefer one parameter set over another in case of similar performance). The console output looks like:
+
+```
+[    15s]    INFO opentuner.search.plugin.DisplayPlugin: tests=8, best {'BS': 32, 'BM': 6, 'BN': 1}, cost accuracy=28.80000000, size=1.0, found by UniformGreedyMutation
+[    27s]    INFO opentuner.search.plugin.DisplayPlugin: tests=19, best {'BS': 48, 'BM': 8, 'BN': 1}, cost accuracy=32.20000000, size=1.0, found by UniformGreedyMutation
+[    40s]    INFO opentuner.search.plugin.DisplayPlugin: tests=31, best {'BS': 48, 'BM': 8, 'BN': 1}, cost accuracy=32.20000000, size=1.0, found by UniformGreedyMutation
+[    54s]    INFO opentuner.search.plugin.DisplayPlugin: tests=43, best {'BS': 48, 'BM': 8, 'BN': 1}, cost accuracy=32.20000000, size=1.0, found by UniformGreedyMutation
+[    67s]    INFO opentuner.search.plugin.DisplayPlugin: tests=53, best {'BS': 48, 'BM': 8, 'BN': 1}, cost accuracy=32.20000000, size=1.0, found by UniformGreedyMutation
+```
+
+The script finally writes a JSON-file with a filename like `tune_multiply-float-12x12x12-60gflops.json` which is encoding the benchmark (multiply), the precision (float), the kernel (12x12x12), and the achieved performance (60gflops). Tuninig starts from an internal default that is supposed to match LIBSMM's internal default parameters. However, tuning can be (re-)started with specific parameters (e.g., `-bs 64`, `-bm 13`, `-bn 1` for `OPENCL_LIBSMM_SMM_BATCHSIZE`, `OPENCL_LIBSMM_SMM_BLOCK_M`, and `OPENCL_LIBSMM_SMM_BLOCK_N` respectively).
+
+## Optimized Kernels
+
+JSON-files in the above mentioned smm-directory are automatically summarized into a CSV-file (can be disabled). Parameters achieved with single-precision (SP) and double-precision (DP) can be safely combined. However, care must be taken to not summarize unrelated results like for different devices or after (major) kernel changes. The CSV-file contains a header-row with column names, and the content is automatically incorporated into LIBSMM by the next clean (re-)build.
+
+```bash
+cd src/acc/opencl
+make realclean
+make DBG=0
+```
+
+This way auto-tuned kernels just work and can be of course exercised using the afore mentioned benchmark:
+
+```bash
+cd src/acc
+./acc_bench_smm 5 30000 13 5 7
+```
+
+Tuned parameters can be also disabled at runtime like:
+
+```bash
+cd src/acc
+OPENCL_LIBSMM_SMM_PARAMS=0 ./acc_bench_smm 5 30000 13 5 7
+```
+
+Further, a CSV-file can be supplied to override embedded parameters or defaults:
+
+```bash
+cd src/acc
+OPENCL_LIBSMM_SMM_PARAMS=opencl/smm/tune_multiply.csv ./acc_bench_smm 5 30000 13 5 7
+```
+
+To tune multiple kernels in a convenient fashion, a triplet specification can be supplied to the [tune_multiply.sh](https://github.com/cp2k/dbcsr/blob/develop/src/acc/opencl/smm/tune_multiply.sh) wrapper script. This script estimates the total runtime for auto-tuning kernels, cleans up intermediate results (`opentuner.db`), allows to specify triplets, and to split work in order to auto-tune in parallel.
+
+Triplets are used to conveniently describe multiple kernels. A triplet specification consists of comma-separated groups of M,N,K-extents, i.e., matrix shapes according to GEMM. For example:
+
+```
+4 10 15, 6 7 8, 23
+```
+
+This triplet specification expands to 55 kernels using the Cartesian product, concatenating the triplets from all expanded groups by combining all values within a comma-separated group. Further, the wrapper script allows to limit the time spent for tuning a single kernel and to partition the amount of kernels to be tuned, e.g., among a cluster of eight systems (below the first partition out of eight would be procesed with five minutes per kernel and about 35 minutes in total per partition).
+
+```bash
+cd src/acc/opencl/smm
+./tune_multiply.sh 300  8 1  4 10 15, 6 7 8, 23
+```
+
+The script `tune_multiply.sh` is tuning 1444 kernels by default (`./acc_bench_smm 300 8 1` taking approximately 15 hours per part).
diff --git a/src/acc/opencl/smm/kernels/multiply.cl b/src/acc/opencl/smm/kernels/multiply.cl
new file mode 100644
index 00000000000..949bac5c012
--- /dev/null
+++ b/src/acc/opencl/smm/kernels/multiply.cl
@@ -0,0 +1,173 @@
+/*------------------------------------------------------------------------------------------------*
+ * Copyright (C) by the DBCSR developers group - All rights reserved                              *
+ * This file is part of the DBCSR library.                                                        *
+ *                                                                                                *
+ * For information on the license, see the LICENSE file.                                          *
+ * For further information please visit https://dbcsr.cp2k.org                                    *
+ * SPDX-License-Identifier: GPL-2.0+                                                              *
+ *------------------------------------------------------------------------------------------------*/
+
+/* number of M-blocks */
+#define NBM ((SM + BM - 1) / BM)
+/* number of N-blocks */
+#define NBN ((SN + BN - 1) / BN)
+/* size of workgroup (WG) */
+#define SWG (NBM * NBN)
+
+
+__attribute__((always_inline))
+inline void atomic_add_global_cmpxchg(global volatile T* dst, T inc)
+{
+  union { TA a; T f; } old_val, try_val, new_val = { .f = *dst };
+  do {
+    old_val.a = new_val.a;
+    try_val.f = old_val.f + inc;
+    new_val.a = CMPXCHG((global volatile TA*)dst, old_val.a, try_val.a);
+  } while (old_val.a != new_val.a);
+}
+
+
+__attribute__((always_inline))
+inline void atomic_add_global_xchg(global volatile T* dst, T inc)
+{
+  union { TA a; T f; } old_val = { .f = inc }, try_val, new_val = { .f = 0 };
+  do {
+    try_val.a = XCHG((global volatile TA*)dst, new_val.a);
+    try_val.f += old_val.f;
+    old_val.a = XCHG((global volatile TA*)dst, try_val.a);
+  } while (old_val.a != new_val.a);
+}
+
+
+kernel void FN(global T *restrict cmat,
+  GLOBAL const T *restrict amat, GLOBAL const T *restrict bmat,
+#if (1 < BS)
+  GLOBAL const int *restrict param_stack, int stack_size)
+#else
+  GLOBAL const int *restrict param_stack)
+#endif
+{
+  const int gid = get_group_id(0), idx = get_local_id(0);
+  GLOBAL const int *const restrict params = param_stack + gid * (3 * BS);
+  /* indexes given by param_stack are one-based */
+  int a0 = params[0] - 1, b0 = params[1] - 1, c0 = params[2] - 1;
+  global T *restrict cwg = cmat + c0;
+
+  local T a[SM][SK];
+  T am[SK], bn[SK];
+#if (SWG != SN)
+  local T b[SK][SN];
+# if (1 < BS)
+  T c[BM][BN] = {{ 0 }};
+# endif
+#else
+# if (1 < BS)
+  T c[SM] = { 0 };
+# endif
+#endif
+
+  /* intra-kernel mini-batch of SMMs */
+#if (1 < BS)
+  const int batchsize = min(BS, stack_size - BS * gid);
+  for (int i = 0; i < batchsize; ++i)
+#endif
+  {
+#if (SWG != SN)
+    const int im = idx / NBN;
+    const int m0 = im * BM, m1 = min(m0 + BM, SM);
+    const int n0 = (idx - im * NBN) * BN;
+    const int n1 = min(n0 + BN, SN);
+#else
+    const int bm = (SM + SWG - 1) / SWG;
+    const int m0 = idx * bm, m1 = min(m0 + bm, SM);
+    const int n = idx;
+#endif
+
+#if (1 < BS)
+    int a1, b1, c1;
+    if (i < (batchsize - 1)) {
+      a1 = params[3*i+3] - 1;
+      b1 = params[3*i+4] - 1;
+      c1 = params[3*i+5] - 1;
+    }
+    else {
+      a1 = b1 = c1 = -1;
+    }
+#endif
+
+    { /* transpose A-matrix into local buffer */
+      GLOBAL const T *const restrict awg = amat + a0;
+      for (int m = m0; m < m1; ++m) {
+        for (int k = 0; k < SK; ++k) a[m][k] = awg[SM*k+m];
+      }
+#if (1 < BS)
+      a0 = a1; /* next iteration */
+#endif
+    }
+
+    { /* copy B-matrix into local or private buffer */
+      GLOBAL const T *const restrict bwg = bmat + b0;
+      for (int k = 0; k < SK; ++k) {
+#if (SWG != SN)
+        for (int n = n0; n < n1; ++n) b[k][n] = bwg[SN*k+n];
+#else
+        bn[k] = bwg[SN*k+n];
+#endif
+      }
+#if (1 < BS)
+      b0 = b1; /* next iteration */
+#endif
+    }
+
+    { /* calculate private result-tile */
+      barrier(CLK_LOCAL_MEM_FENCE);
+#if (SWG != SN)
+      for (int m = m0; m < m1; ++m) {
+        for (int k = 0; k < SK; ++k) am[k] = a[m][k];
+        for (int n = n0; n < n1; ++n) {
+          T r = 0;
+          for (int k = 0; k < SK; ++k) bn[k] = b[k][n];
+          for (int k = 0; k < SK; ++k) r = FMA(am[k], bn[k], r);
+# if (1 < BS)
+          c[m-m0][n-n0] += r;
+# else
+          ATOMIC_ADD_GLOBAL(&cwg[SM*n+m], r);
+# endif
+        }
+      }
+#else
+      for (int m = 0; m < SM; ++m) {
+        T r = 0;
+        for (int k = 0; k < SK; ++k) am[k] = a[m][k];
+        for (int k = 0; k < SK; ++k) r = FMA(am[k], bn[k], r);
+# if (1 < BS)
+        c[m] += r;
+# else
+        ATOMIC_ADD_GLOBAL(&cwg[SM*n+m], r);
+# endif
+      }
+#endif
+    }
+
+#if (1 < BS)
+    if (c0 != c1) { /* copy private tile to global memory */
+# if (SWG != SN)
+      for (int m = m0; m < m1; ++m) for (int n = n0; n < n1; ++n) {
+        T *const restrict r = &c[m-m0][n-n0];
+        ATOMIC_ADD_GLOBAL(&cwg[SM*n+m], *r);
+        *r = 0; /* reset */
+      }
+# else
+      for (int m = 0; m < SM; ++m) {
+        ATOMIC_ADD_GLOBAL(&cwg[SM*n+m], c[m]);
+        c[m] = 0; /* reset */
+      }
+# endif
+      /* next iteration */
+      cwg = cmat + c1;
+      c0 = c1;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+  }
+}
diff --git a/src/acc/opencl/smm/kernels/transpose.cl b/src/acc/opencl/smm/kernels/transpose.cl
new file mode 100644
index 00000000000..55c4a1faa2f
--- /dev/null
+++ b/src/acc/opencl/smm/kernels/transpose.cl
@@ -0,0 +1,66 @@
+/*------------------------------------------------------------------------------------------------*
+ * Copyright (C) by the DBCSR developers group - All rights reserved                              *
+ * This file is part of the DBCSR library.                                                        *
+ *                                                                                                *
+ * For information on the license, see the LICENSE file.                                          *
+ * For further information please visit https://dbcsr.cp2k.org                                    *
+ * SPDX-License-Identifier: GPL-2.0+                                                              *
+ *------------------------------------------------------------------------------------------------*/
+
+kernel void FN(GLOBAL const int *restrict trs_stack, int trs_offset, global T *restrict matrix)
+{
+  /* offset in the transpose-stack that this block ID should handle */
+  const int offset = trs_stack[trs_offset+get_group_id(0)];
+  /* matrix according to the index (transpose-stack) */
+  global T *const restrict mat = matrix + offset;
+  const int index = get_local_id(0);
+#if (SM != SN) || (0 == INPLACE)
+  /* local memory buffer */
+  local T buf[SM*SN];
+#endif
+
+#if (SWG == SM)
+  const int m = index;
+# if (SM != SN) || (0 == INPLACE)
+  /* copy matrix elements into local buffer */
+  for (int n = 0; n < SN; ++n) buf[SN*m+n] = mat[SN*m+n];
+  barrier(CLK_LOCAL_MEM_FENCE);
+  /* overwrite matrix elements (gather) */
+  for (int n = 0; n < SN; ++n) mat[SN*m+n] = buf[SM*n+m];
+# else
+  for (int n = 0; n < m; ++n) {
+    const int i = SM * n + m;
+    const int j = SN * m + n;
+    const T tmp = mat[i];
+    mat[i] = mat[j];
+    mat[j] = tmp;
+  }
+# endif
+#else
+  T prv[SN]; /* private buffer */
+  const int msize = (SM + SWG - 1) / SWG;
+  const int m0 = index * msize, m1 = min(m0 + msize, SM);
+# if (SM != SN) || (0 == INPLACE)
+  /* copy matrix elements into local buffer */
+  for (int m = m0; m < m1; ++m) {
+    for (int n = 0; n < SN; ++n) buf[SN*m+n] = mat[SN*m+n];
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+# endif
+  for (int m = m0; m < m1; ++m) {
+# if (SM != SN) || (0 == INPLACE)
+    for (int n = 0; n < SN; ++n) prv[n] = buf[SM*n+m];
+    /* overwrite matrix elements (gather) */
+    for (int n = 0; n < SN; ++n) mat[SN*m+n] = prv[n];
+# else
+    for (int n = 0; n < SN; ++n) prv[n] = mat[SM*n+m];
+    for (int n = 0; n < m; ++n) {
+      const int i = SM * n + m;
+      const int j = SN * m + n;
+      mat[i] = mat[j];
+      mat[j] = prv[n];
+    }
+# endif
+  }
+#endif
+}
diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c
new file mode 100644
index 00000000000..e0a976adae2
--- /dev/null
+++ b/src/acc/opencl/smm/opencl_libsmm.c
@@ -0,0 +1,748 @@
+/*------------------------------------------------------------------------------------------------*
+ * Copyright (C) by the DBCSR developers group - All rights reserved                              *
+ * This file is part of the DBCSR library.                                                        *
+ *                                                                                                *
+ * For information on the license, see the LICENSE file.                                          *
+ * For further information please visit https://dbcsr.cp2k.org                                    *
+ * SPDX-License-Identifier: GPL-2.0+                                                              *
+ *------------------------------------------------------------------------------------------------*/
+#if defined(__OPENCL)
+#include "opencl_libsmm.h"
+/* Header opencl_kernels.h is generated by the build system using acc_opencl.sh */
+#include "opencl_kernels.h"
+#include <libxsmm_sync.h>
+#include <assert.h>
+#if defined(_OPENMP)
+# include <omp.h>
+#endif
+
+#if LIBXSMM_VERSION3(1, 16, 1) <= LIBXSMM_VERSION3(LIBXSMM_VERSION_MAJOR, \
+    LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE) && 808 <= LIBXSMM_VERSION_PATCH
+# define OPENCL_LIBSMM_REGISTER(KEY, KEY_SIZE, VALUE_SIZE, VALUE_INIT) \
+    libxsmm_xregister(KEY, KEY_SIZE, VALUE_SIZE, VALUE_INIT, NULL/*key_hash*/)
+# define OPENCL_LIBSMM_DISPATCH(KEY, KEY_SIZE) \
+    libxsmm_xdispatch(KEY, KEY_SIZE, NULL/*key_hash*/)
+#else
+# define OPENCL_LIBSMM_REGISTER(KEY, KEY_SIZE, VALUE_SIZE, VALUE_INIT) \
+    libxsmm_xregister(KEY, KEY_SIZE, VALUE_SIZE, VALUE_INIT)
+# define OPENCL_LIBSMM_DISPATCH(KEY, KEY_SIZE) \
+    libxsmm_xdispatch(KEY, KEY_SIZE)
+#endif
+
+#if !defined(OPENCL_LIBSMM_DEBUG_TRANS) && defined(OPENCL_LIBSMM_DEBUG)
+# define OPENCL_LIBSMM_DEBUG_TRANS
+#endif
+#if !defined(OPENCL_LIBSMM_DEBUG_SMM) && defined(OPENCL_LIBSMM_DEBUG)
+# define OPENCL_LIBSMM_DEBUG_SMM
+#endif
+#if !defined(OPENCL_LIBSMM_NLOCKS_TRANS)
+# define OPENCL_LIBSMM_NLOCKS_TRANS 16
+#endif
+#if !defined(OPENCL_LIBSMM_NLOCKS_SMM)
+# define OPENCL_LIBSMM_NLOCKS_SMM 16
+#endif
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int opencl_libsmm_initialized;
+volatile int opencl_libsmm_lock_trans[OPENCL_LIBSMM_NLOCKS_TRANS];
+volatile int opencl_libsmm_lock_smm[OPENCL_LIBSMM_NLOCKS_SMM];
+
+
+int opencl_libsmm_use_cmem(cl_device_id device)
+{
+#if defined(OPENCL_LIBSMM_CMEM)
+  int result = EXIT_SUCCESS;
+  cl_ulong size_maxalloc = 1, size_maxcmem = 0;
+  ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+    sizeof(cl_ulong), &size_maxalloc, NULL), "retrieve maximum size of memory allocation", result);
+  ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
+    sizeof(cl_ulong), &size_maxcmem, NULL), "retrieve maximum size of constant buffer", result);
+  return (EXIT_SUCCESS == result ? (size_maxalloc <= size_maxcmem ? EXIT_SUCCESS : EXIT_FAILURE) : result);
+#else
+  ACC_OPENCL_UNUSED(device);
+  return EXIT_FAILURE;
+#endif
+}
+
+
+#if defined(OPENCL_LIBSMM_DEBUG) && defined(_DEBUG)
+void opencl_libsmm_print_matrix(FILE* ostream, const char* label, libsmm_acc_data_t type, const void* mat, int m, int n)
+{
+  int i, j;
+  const char *const s = (NULL != label ? label : "");
+  const int len = (int)strlen(s);
+  for (i = 0; i < m; ++i) {
+    if (0 < i) fprintf(ostream, "%*s", len, " "); else fprintf(ostream, "%s", s);
+    for (j = 0; j < n; ++j) {
+      switch (type) {
+        case dbcsr_type_real_8: fprintf(ostream, "%.2f ", ((double*)mat)[i*n+j]); break;
+        case dbcsr_type_real_4: fprintf(ostream, "%.2f ", ((float*) mat)[i*n+j]); break;
+        default: fprintf(ostream, "? ");
+      }
+    }
+    fprintf(ostream, "\n");
+  }
+}
+#endif
+
+
+int opencl_libsmm_read_params(char* parambuf,
+  opencl_libsmm_smmkey_t* key, opencl_libsmm_smm_t* value)
+{
+  const char* s = strtok(parambuf, OPENCL_LIBSMM_PARAMS_DELIMS);
+  int consumed = 0, t = 0, i;
+  assert(NULL != key && NULL != value);
+  for (; NULL != s; s = strtok(NULL, OPENCL_LIBSMM_PARAMS_DELIMS), ++t) {
+    switch (t) {
+      case 0: if (1 == sscanf(s, "%i", &i)) {
+        key->type = (libsmm_acc_data_t)i; ++consumed;
+      } break;
+      case 1: if (1 == sscanf(s, "%i", &i)) {
+        key->m = i; ++consumed;
+      } break;
+      case 2: if (1 == sscanf(s, "%i", &i)) {
+        key->n = i; ++consumed;
+      } break;
+      case 3: if (1 == sscanf(s, "%i", &i)) {
+        key->k = i; ++consumed;
+      } break;
+      case 5: if (1 == sscanf(s, "%i", &i)) {
+        value->bs = i; ++consumed;
+      } break;
+      case 6: if (1 == sscanf(s, "%i", &i)) {
+        value->bm = i; ++consumed;
+      } break;
+      case 7: if (1 == sscanf(s, "%i", &i)) {
+        value->bn = i; ++consumed;
+      } break;
+    }
+  }
+  return (7 == consumed ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+
+int libsmm_acc_init(void)
+{
+#if defined(_OPENMP)
+    /* initialization/finalization is not meant to be thread-safe */
+  int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE);
+#else
+  int result = EXIT_SUCCESS;
+#endif
+  /* multiple calls to libsmm_acc_init are not considered as an error */
+  if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&opencl_libsmm_initialized, 1, LIBXSMM_ATOMIC_RELAXED)) {
+#if !defined(__DBCSR_ACC)
+    /* DBCSR shall call acc_init as well as libsmm_acc_init (since both interfaces are used).
+     * Also, libsmm_acc_init may privately call acc_init (as it depends on the ACC interface).
+     * The implementation of acc_init should hence be safe against "over initialization".
+     * However, DBCSR only calls acc_init (and expects an implicit libsmm_acc_init).
+     */
+    if (EXIT_SUCCESS == result) {
+      result = acc_init();
+    }
+#endif
+    if (EXIT_SUCCESS == result) {
+      char buffer[ACC_OPENCL_BUFFERSIZE];
+      const char *const env_params = getenv("OPENCL_LIBSMM_SMM_PARAMS");
+      opencl_libsmm_smm_t config;
+      opencl_libsmm_smmkey_t key;
+      /* zeroing config once (tuned parameters are setup below) */
+      LIBXSMM_MEMZERO127(&config);
+      /* potentially heterogeneous key-data */
+      LIBXSMM_MEMZERO127(&key);
+      if (NULL == env_params || '0' != *env_params) {
+        if (NULL != env_params && '\0' != *env_params) {
+          FILE *const file = fopen(env_params, "r");
+          /* consume first line and skip CSV header line */
+          if (NULL == file || NULL == fgets(buffer, ACC_OPENCL_BUFFERSIZE, file)) {
+            result = EXIT_FAILURE;
+          }
+          while (EXIT_SUCCESS == result &&
+            NULL != fgets(buffer, ACC_OPENCL_BUFFERSIZE, file))
+          {
+            result = opencl_libsmm_read_params(buffer, &key, &config);
+            if (EXIT_SUCCESS == result &&
+              NULL == OPENCL_LIBSMM_REGISTER(&key, sizeof(key), sizeof(config), &config))
+            {
+              result = EXIT_FAILURE;
+            }
+          }
+        }
+#if defined(OPENCL_LIBSMM_PARAMS_SMM)
+        else {
+          const char* line = OPENCL_LIBSMM_PARAMS_SMM, *next;
+          do {
+            next = strchr(line, '\n');
+            if (NULL != next && next < (line + ACC_OPENCL_BUFFERSIZE)) {
+              const int len = next - line;
+              memcpy(buffer, line, len); buffer[len] = '\0';
+              result = opencl_libsmm_read_params(buffer, &key, &config);
+              if (EXIT_SUCCESS == result &&
+                NULL == OPENCL_LIBSMM_REGISTER(&key, sizeof(key), sizeof(config), &config))
+              {
+                result = EXIT_FAILURE; break;
+              }
+              line = ++next;
+            }
+          } while (NULL != next);
+        }
+#endif
+      }
+    }
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int libsmm_acc_finalize(void)
+{
+  /* Routine libsmm_acc_init is called in master thread inside of parallel region
+   * However, libsmm_acc_finalize is indirectly called (acc_finalize) inside of a
+   * parallel region (not just the master thread).
+   */
+#if defined(_OPENMP) && /*WORKAROUND*/!defined(__DBCSR_ACC)
+  /* initialization/finalization is not meant to be thread-safe */
+  int result = (0 == omp_in_parallel() ? EXIT_SUCCESS : EXIT_FAILURE);
+#else
+  int result = EXIT_SUCCESS;
+#endif
+#if 0
+  /* multiple calls to libsmm_acc_finalize are not considered as an error */
+  if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&opencl_libsmm_initialized, 1, LIBXSMM_ATOMIC_RELAXED)) {
+  }
+#endif
+  /* acc_finalize is not called since it can be used independently */
+  return result;
+}
+
+
+acc_bool_t libsmm_acc_is_thread_safe(void)
+{
+  /* match DBCSR's threading level */
+#if defined(_OPENMP)
+  return 1;
+#else
+  return 0;
+#endif
+}
+
+
+int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size,
+  void* dev_data, libsmm_acc_data_t datatype, int m, int n, int max_kernel_dim, void* stream)
+{
+  const int mn = m * n;
+  int result = EXIT_SUCCESS;
+  assert((NULL != dev_trs_stack && NULL != dev_data && 0 <= offset && 0 <= stack_size) || 0 == stack_size);
+  if ((
+#if defined(OPENCL_LIBSMM_F64)
+      dbcsr_type_real_8 == datatype
+#else
+      0
+#endif
+      ||
+#if defined(OPENCL_LIBSMM_F32)
+      dbcsr_type_real_4 == datatype
+#else
+      0
+#endif
+    ) &&
+    0 < stack_size && 1 < mn && m <= max_kernel_dim && n <= max_kernel_dim)
+  {
+    opencl_libsmm_trans_t* config;
+    opencl_libsmm_transkey_t key;
+    LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data */
+    key.type = datatype; key.m = m; key.n = n; /* initialize key */
+    config = (opencl_libsmm_trans_t*)OPENCL_LIBSMM_DISPATCH(&key, sizeof(key));
+    if (NULL == config) {
+      char build_options[ACC_OPENCL_BUFFERSIZE], fname[32];
+      int nchar = ACC_OPENCL_SNPRINTF(fname, sizeof(fname), "xtrans%ix%i", m, n);
+      if (0 < nchar && (int)sizeof(fname) > nchar) {
+        cl_device_id active_device;
+        result = acc_opencl_device(stream, &active_device);
+        if (EXIT_SUCCESS == result) {
+          const char *const env_options = getenv("OPENCL_LIBSMM_TRANS_BUILDOPTS");
+          const char *const env_inplace = getenv("OPENCL_LIBSMM_TRANS_INPLACE");
+          const char *const env_blockm = getenv("OPENCL_LIBSMM_TRANS_BLOCK_M");
+          const int inplace = ((m == n) && ((NULL == env_inplace || '\0' == *env_inplace)
+#if defined(OPENCL_LIBSMM_TRANS_INPLACE)
+            ? 1 : ('0' != *env_inplace)));
+#else
+            ? 0 : ('0' != *env_inplace)));
+#endif
+          const int blockm = ((NULL == env_blockm || '\0' == *env_blockm)
+            ? m/*TODO*/ : atoi(env_blockm));
+          const int bm = LIBXSMM_CLMP(blockm, 1, m);
+          const char* typename = "";
+          int wgsize;
+          switch (datatype) {
+            case dbcsr_type_real_8: {
+              typename = "char8"; /* double */
+              fname[0] = 'd';
+            } break;
+            case dbcsr_type_real_4: {
+              typename = "float";
+              fname[0] = 's';
+            } break;
+            default: ;
+          }
+          wgsize = ((m == bm || 0 == (m % bm)) ? bm : m);
+          nchar = ACC_OPENCL_SNPRINTF(build_options, sizeof(build_options), "%s"
+            " -DGLOBAL=%s -DINPLACE=%i -DFN=%s -DSM=%i -DSN=%i -DSWG=%i -DT=%s",
+            (NULL == env_options || '\0' == *env_options) ? "" : env_options,
+            EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant",
+            inplace, fname, m, n, wgsize, typename);
+          if ('\0' != *typename && 0 < nchar && (int)sizeof(build_options) > nchar) {
+            opencl_libsmm_trans_t new_config;
+#if defined(OPENCL_LIBSMM_SOURCE_TRANSPOSE)
+            result = acc_opencl_kernel(OPENCL_LIBSMM_SOURCE_TRANSPOSE,
+              build_options, fname, &new_config.kernel);
+#else
+            ACC_OPENCL_UNUSED(inplace);
+            result = EXIT_FAILURE;
+#endif
+            if (EXIT_SUCCESS == result) {
+              int max_wgsize;
+              result = acc_opencl_wgsize(active_device, new_config.kernel,
+                &max_wgsize, NULL/*preferred_multiple*/);
+              if (EXIT_SUCCESS == result) {
+                assert(0 < max_wgsize);
+                if (wgsize <= max_wgsize) {
+                  new_config.wgsize = (size_t)wgsize;
+                  config = (opencl_libsmm_trans_t*)OPENCL_LIBSMM_REGISTER(&key, sizeof(key),
+                    sizeof(new_config), &new_config);
+                }
+                else result = EXIT_FAILURE;
+              }
+            }
+          }
+          else {
+            result = EXIT_FAILURE;
+          }
+        }
+      }
+      else {
+        result = EXIT_FAILURE;
+      }
+    }
+    assert((NULL != config && NULL != config->kernel && 0 < config->wgsize) || EXIT_SUCCESS != result);
+    if (EXIT_SUCCESS == result) {
+      const size_t work_size = config->wgsize * stack_size;
+#if defined(OPENCL_LIBSMM_DEBUG_TRANS)
+      const int offset_stack_size = offset + stack_size;
+      int *const stack = (int*)libxsmm_aligned_scratch(sizeof(int) * offset_stack_size, 0/*auto-align*/);
+      char *imat = NULL, *omat = NULL, *gold = NULL;
+      const int typesize = (dbcsr_type_real_8 == datatype ? 8
+        : (dbcsr_type_real_4 == datatype ? 4 : 0/*unknown*/));
+      size_t data_size;
+      if (NULL != stack && CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_data),
+        CL_MEM_SIZE, sizeof(size_t), &data_size, NULL))
+      {
+        imat = (char*)libxsmm_aligned_scratch(data_size, 0/*auto-align*/);
+        omat = (char*)libxsmm_aligned_scratch(data_size, 0/*auto-align*/);
+        gold = (char*)libxsmm_aligned_scratch(mn * typesize, 0/*auto-align*/);
+        if (NULL != imat && NULL != omat && NULL != gold) {
+          ACC_OPENCL_CHECK(acc_memcpy_d2h(dev_trs_stack, stack, sizeof(int) * offset_stack_size, stream),
+            "transfer debug stack", result);
+          ACC_OPENCL_CHECK(acc_memcpy_d2h(dev_data, imat, data_size, stream),
+            "transfer debug input", result);
+        }
+        else result = EXIT_FAILURE;
+      }
+      else result = EXIT_FAILURE;
+#endif
+      assert(!(OPENCL_LIBSMM_NLOCKS_TRANS & (OPENCL_LIBSMM_NLOCKS_TRANS - 1))); /* POT */
+      { /* OpenCL is thread-safe except for clSetKernelArg and launching such shared kernel */
+        const unsigned int hash = libxsmm_hash(&config->kernel, sizeof(cl_kernel), 25071975/*seed*/);
+        volatile int *const lock = opencl_libsmm_lock_trans + LIBXSMM_MOD2(hash, OPENCL_LIBSMM_NLOCKS_TRANS);
+        LIBXSMM_ATOMIC_ACQUIRE(lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED);
+        ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 0, sizeof(cl_mem), ACC_OPENCL_MEM(dev_trs_stack)),
+          "set batch-list argument of transpose kernel", result);
+        ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 1, sizeof(int), &offset),
+          "set offset argument of transpose kernel", result);
+        ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 2, sizeof(cl_mem), ACC_OPENCL_MEM(dev_data)),
+          "set matrix-data argument of transpose kernel", result);
+        ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(*ACC_OPENCL_STREAM(stream),
+          config->kernel, 1/*work_dim*/, NULL, &work_size, &config->wgsize, 0, NULL, NULL),
+          "launch transpose kernel", result);
+        LIBXSMM_ATOMIC_RELEASE(lock, LIBXSMM_ATOMIC_RELAXED);
+      }
+#if defined(OPENCL_LIBSMM_DEBUG_TRANS)
+      ACC_OPENCL_CHECK(acc_memcpy_d2h(dev_data, omat, data_size, stream),
+        "transfer debug test", result);
+#endif
+#if defined(OPENCL_LIBSMM_DEBUG_TRANS) || defined(OPENCL_LIBSMM_SYNC)
+      ACC_OPENCL_CHECK(acc_stream_sync(stream), "sync stream", result);
+#endif
+#if defined(OPENCL_LIBSMM_DEBUG_TRANS)
+      if (EXIT_SUCCESS == result) {
+        int i, j;
+        fprintf(stderr, "libsmm_acc_transpose("
+          "offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size,
+          dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"),
+          m, n, max_kernel_dim, stream);
+        for (i = offset; i < offset_stack_size; ++i) {
+          const size_t index = stack[i];
+          const char *const orig = imat + index * typesize;
+          const char *const test = omat + index * typesize;
+          assert((index * typesize) < data_size);
+          memcpy(gold, orig, mn * typesize);
+          libxsmm_itrans(gold, typesize, m, n, m, n);
+          if (0 != memcmp(gold, test, mn * typesize)) {
+            fprintf(stderr, " => ERROR\n");
+# if defined(_DEBUG)
+            opencl_libsmm_print_matrix(stderr, "orig = ", datatype, orig, m, n);
+            opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold, n, m);
+            opencl_libsmm_print_matrix(stderr, "test = ", datatype, test, n, m);
+            fprintf(stderr, "\n");
+# endif
+            result = EXIT_FAILURE; break;
+          }
+          for (j = offset; j < i; ++j) {
+            const size_t duplicate = stack[j];
+            if (index == duplicate) {
+              fprintf(stderr, " => ERROR\n");
+              result = EXIT_FAILURE;
+              i = offset_stack_size;
+              break;
+            }
+          }
+        }
+        if (EXIT_SUCCESS == result) fprintf(stderr, " => OK\n");
+      }
+      libxsmm_free(stack);
+      libxsmm_free(imat);
+      libxsmm_free(omat);
+      libxsmm_free(gold);
+#endif
+    }
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+
+int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, int stack_size,
+  int nparams, libsmm_acc_data_t datatype, const void* dev_a_data, const void* dev_b_data, void* dev_c_data,
+  int m_max, int n_max, int k_max, int max_kernel_dim, acc_bool_t def_mnk, void* stream, void* c_stream)
+{
+  int result = EXIT_SUCCESS;
+  ACC_OPENCL_UNUSED(c_stream); /* TODO */
+  assert(0 == stack_size || (NULL != host_param_stack && NULL != dev_param_stack
+    && NULL != dev_a_data && NULL != dev_b_data && NULL != dev_c_data));
+  assert(0 < nparams && 0 < max_kernel_dim && NULL != stream);
+  assert(0 <= stack_size && 0 <= m_max && 0 <= n_max && 0 <= k_max);
+  if ((
+#if defined(OPENCL_LIBSMM_F64)
+      dbcsr_type_real_8 == datatype
+#else
+      0
+#endif
+      ||
+#if defined(OPENCL_LIBSMM_F32)
+      dbcsr_type_real_4 == datatype
+#else
+      0
+#endif
+    ) &&
+    0 < stack_size && def_mnk/*homogeneous*/ &&
+    0 < m_max && m_max <= max_kernel_dim &&
+    0 < n_max && n_max <= max_kernel_dim &&
+    0 < k_max && k_max <= max_kernel_dim)
+  {
+    opencl_libsmm_smm_t* config;
+    opencl_libsmm_smmkey_t key;
+    LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data */
+    key.type = datatype; key.m = m_max; key.n = n_max; key.k = k_max; /* initialize key */
+    config = (opencl_libsmm_smm_t*)OPENCL_LIBSMM_DISPATCH(&key, sizeof(key));
+    if (NULL == config || NULL == config->kernel) {
+      char build_options[ACC_OPENCL_BUFFERSIZE], fname[48];
+      int nchar = ACC_OPENCL_SNPRINTF(fname, sizeof(fname), "xmm%ix%ix%i", m_max, n_max, k_max);
+      const char* extensions = NULL;
+      if (0 < nchar && (int)sizeof(fname) > nchar) {
+        cl_device_id active_device;
+        result = acc_opencl_device(stream, &active_device);
+        if (EXIT_SUCCESS == result) {
+          const char *atomic_cmpxchg = NULL, *atomic_xchg = NULL;
+          const char *atomic_type = NULL, *typename = NULL;
+          assert(NULL != active_device);
+          switch (datatype) {
+            case dbcsr_type_real_8: {
+              extensions = "cl_khr_fp64 cl_khr_int64_base_atomics";
+              if (EXIT_SUCCESS == acc_opencl_device_ext(active_device, &extensions, 1)) {
+                atomic_cmpxchg = "atom_cmpxchg";
+                atomic_xchg = "atom_xchg";
+                atomic_type = "long";
+                typename = "double";
+                fname[0] = 'd';
+              }
+            } break;
+            case dbcsr_type_real_4: {
+              extensions = "cl_khr_global_int32_base_atomics";
+              if (EXIT_SUCCESS == acc_opencl_device_ext(active_device, &extensions, 1)) {
+                atomic_cmpxchg = "atomic_cmpxchg";
+                atomic_xchg = "atomic_xchg";
+                atomic_type = "int";
+                typename = "float";
+                fname[0] = 's';
+              }
+            } break;
+            default: ;
+          }
+          if (NULL != typename) {
+            int max_wgsize, wgsize, bs, bm, bn, nbm, nbn;
+            result = acc_opencl_wgsize(active_device, NULL/*device-specific*/,
+              &max_wgsize, NULL/*preferred_multiple*/);
+            if (EXIT_SUCCESS == result) {
+              const char *const env_batchsize = getenv("OPENCL_LIBSMM_SMM_BATCHSIZE");
+              const char *const env_blockm = getenv("OPENCL_LIBSMM_SMM_BLOCK_M");
+              const char *const env_blockn = getenv("OPENCL_LIBSMM_SMM_BLOCK_N");
+              /* TODO: load parameters from file (auto-tuned) */
+              const int batchsize = ((NULL == env_batchsize || '\0' == *env_batchsize)
+                ? (NULL == config ? 32/*default*/ : config->bs) : atoi(env_batchsize));
+              const int blockm = ((NULL == env_blockm || '\0' == *env_blockm)
+                ? (NULL == config ? m_max/*default*/ : config->bm) : atoi(env_blockm));
+              const int blockn = ((NULL == env_blockn || '\0' == *env_blockn)
+                ? (NULL == config ? 1/*default*/ : config->bn) : atoi(env_blockn));
+              bm = LIBXSMM_CLMP(blockm, 1, m_max);
+              bn = LIBXSMM_CLMP(blockn, 1, n_max);
+              bs = LIBXSMM_MAX(batchsize, 1);
+              nbm = (m_max + bm - 1) / bm;
+              nbn = (n_max + bn - 1) / bn;
+              wgsize = nbm * nbn;
+              assert(1 <= bs && 0 < wgsize && 0 < max_wgsize);
+              /* limit WG-size to device's maximum WG-size */
+              while (max_wgsize < wgsize && (bm < m_max || bn < n_max)) {
+                if (bn < n_max) {
+                  ++bn; nbn = (n_max + bn - 1) / bn;
+                }
+                else if (bm < m_max) {
+                  ++bm; nbm = (m_max + bm - 1) / bm;
+                }
+                wgsize = nbm * nbn;
+              }
+              if (wgsize <= max_wgsize) { /* SMMs can be potentially handled by device */
+                const char *const env_options = getenv("OPENCL_LIBSMM_SMM_BUILDOPTS");
+                const char *const env_atomics = getenv("OPENCL_LIBSMM_SMM_ATOMICS");
+                const char *atomics = NULL;
+                if (NULL == env_atomics || '0' != *env_atomics) {
+                  if ((NULL == env_atomics && EXIT_SUCCESS != acc_opencl_device_vendor(active_device, "nvidia"))
+                    || NULL != acc_opencl_stristr(env_atomics, "cmpxchg"))
+                  {
+                    atomics = "atomic_add_global_cmpxchg(A,B)";
+                  }
+                  else {
+                    atomics = "atomic_add_global_xchg(A,B)";
+                  }
+                }
+                else {
+                  atomics = "*(A)+=(B)";
+                }
+                assert(1 <= bs && 0 < bm && 0 < bn && NULL != atomics);
+                nchar = ACC_OPENCL_SNPRINTF(build_options, sizeof(build_options),
+                  "%s -cl-fast-relaxed-math -cl-no-signed-zeros -cl-denorms-are-zero"
+                  " -DGLOBAL=%s -DFN=%s -DSM=%i -DSN=%i -DSK=%i -DBM=%i -DBN=%i -DBS=%i"
+                  " -DT=%s -DTA=\"%s\" -DFMA=fma -DCMPXCHG=%s -DXCHG=%s"
+                  " -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\"",
+                  (NULL == env_options || '\0' == *env_options) ? "" : env_options,
+                  EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant",
+                  fname, m_max, n_max, k_max, bm, bn, bs, typename,
+                  atomic_type, atomic_cmpxchg, atomic_xchg, atomics);
+                if (0 >= nchar || (int)sizeof(build_options) <= nchar) result = EXIT_FAILURE;
+              }
+              else {
+                result = EXIT_FAILURE;
+                ACC_OPENCL_ERROR("matrix-size causes too large WG-size", result);
+              }
+            }
+            if (EXIT_SUCCESS == result) {
+              opencl_libsmm_smm_t new_config;
+#if defined(OPENCL_LIBSMM_SOURCE_MULTIPLY)
+              result = acc_opencl_kernel(OPENCL_LIBSMM_SOURCE_MULTIPLY,
+                build_options, fname, &new_config.kernel);
+#else
+              result = EXIT_FAILURE;
+#endif
+              if (EXIT_SUCCESS == result) {
+                result = acc_opencl_wgsize(active_device, new_config.kernel,
+                  &max_wgsize, NULL/*preferred_multiple*/);
+                if (EXIT_SUCCESS == result) {
+                  assert(0 < wgsize && 0 < max_wgsize);
+                  /* check planned WG-size against kernel-specific WG-size */
+                  if (wgsize <= max_wgsize) {
+                    if (NULL == config) {
+                      config = (opencl_libsmm_smm_t*)OPENCL_LIBSMM_REGISTER(
+                        &key, sizeof(key), sizeof(new_config), &new_config);
+                    }
+                    if (NULL != config) {
+                      config->wgsize = (size_t)wgsize;
+                      config->bs = bs; config->bm = bm; config->bn = bn;
+                      config->kernel = new_config.kernel;
+                    }
+                    else { /* failed to register config */
+                      result = EXIT_FAILURE;
+                    }
+                  }
+                  else {
+                    result = EXIT_FAILURE;
+                    ACC_OPENCL_ERROR("tile-size causes too large WG-size", result);
+                  }
+                }
+              }
+            }
+          }
+          else {
+            result = EXIT_FAILURE;
+            ACC_OPENCL_ERROR("insufficient device capabilities", result);
+          }
+        }
+      }
+      else {
+        result = EXIT_FAILURE;
+      }
+      /* remove configuration from registry to avoid infinitely retrying code generation */
+      if (EXIT_SUCCESS != result && NULL != config) {
+        libxsmm_xrelease(&key, sizeof(key));
+      }
+    }
+    assert(EXIT_SUCCESS != result || /* otherwise config must be valid */
+      (NULL != config && NULL != config->kernel
+        && 1 <= config->bs && 0 < config->bm && 0 < config->bn
+        && 0 < config->wgsize));
+    if (EXIT_SUCCESS == result) {
+      /* adjust overall stacksize according to intra-kernel batchsize */
+      const size_t work_size = ((stack_size + config->bs - 1) / config->bs) * config->wgsize;
+#if defined(OPENCL_LIBSMM_DEBUG_SMM)
+      char *ainp = NULL, *binp = NULL, *cinp = NULL, *test = NULL, *gold = NULL, *btrn = NULL;
+      const libxsmm_gemm_precision precision = (dbcsr_type_real_8 == datatype
+        ? LIBXSMM_GEMM_PRECISION_F64 : (dbcsr_type_real_4 == datatype ? LIBXSMM_GEMM_PRECISION_F32
+        : (libxsmm_gemm_precision)LIBXSMM_DATATYPE_UNSUPPORTED));
+      const int typesize = (dbcsr_type_real_8 == datatype ? 8
+        : (dbcsr_type_real_4 == datatype ? 4 : 0/*unknown*/));
+      size_t asize, bsize, csize;
+      libxsmm_xmmfunction kernel = { NULL };
+      if (  CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_a_data),
+              CL_MEM_SIZE, sizeof(size_t), &asize, NULL)
+        &&  CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_b_data),
+              CL_MEM_SIZE, sizeof(size_t), &bsize, NULL)
+        &&  CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_c_data),
+              CL_MEM_SIZE, sizeof(size_t), &csize, NULL))
+      {
+        const double alpha = 1, beta = 1;
+        libxsmm_descriptor_blob blob;
+        libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_dinit(&blob,
+          precision, m_max, n_max, k_max, m_max, k_max, m_max, alpha, beta,
+          LIBXSMM_GEMM_FLAG_NONE, LIBXSMM_PREFETCH_NONE);
+        ainp = (char*)libxsmm_aligned_scratch(asize, 0/*auto-align*/);
+        binp = (char*)libxsmm_aligned_scratch(bsize, 0/*auto-align*/);
+        test = (char*)libxsmm_aligned_scratch(csize, 0/*auto-align*/);
+        gold = (char*)libxsmm_aligned_scratch(csize, 0/*auto-align*/);
+        btrn = (char*)libxsmm_aligned_scratch(k_max * n_max * typesize, 0/*auto-align*/);
+        if (NULL != desc && NULL != ainp && NULL != binp && NULL != test && NULL != gold && NULL != btrn) {
+          ACC_OPENCL_CHECK(acc_memcpy_d2h(dev_a_data, ainp, asize, stream),
+            "transfer debug a-data", result);
+          ACC_OPENCL_CHECK(acc_memcpy_d2h(dev_b_data, binp, bsize, stream),
+            "transfer debug b-data", result);
+          ACC_OPENCL_CHECK(acc_memcpy_d2h(dev_c_data, gold, csize, stream),
+            "transfer debug c-data", result);
+          kernel = libxsmm_xmmdispatch(desc);
+          assert(NULL != kernel.xmm);
+        }
+        else result = EXIT_FAILURE;
+      }
+      else result = EXIT_FAILURE;
+#endif
+      assert(!(OPENCL_LIBSMM_NLOCKS_SMM & (OPENCL_LIBSMM_NLOCKS_SMM - 1))); /* POT */
+      { /* OpenCL is thread-safe except for clSetKernelArg and launching such shared kernel */
+        const unsigned int hash = libxsmm_hash(&config->kernel, sizeof(cl_kernel), 25071975/*seed*/);
+        volatile int *const lock = opencl_libsmm_lock_smm + LIBXSMM_MOD2(hash, OPENCL_LIBSMM_NLOCKS_SMM);
+        LIBXSMM_ATOMIC_ACQUIRE(lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED);
+        ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 0, sizeof(cl_mem), ACC_OPENCL_MEM(dev_c_data)),
+          "set C-matrix argument of SMM-kernel", result);
+        ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 1, sizeof(cl_mem), ACC_OPENCL_MEM(dev_a_data)),
+          "set A-matrix argument of SMM-kernel", result);
+        ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 2, sizeof(cl_mem), ACC_OPENCL_MEM(dev_b_data)),
+          "set B-matrix argument of SMM-kernel", result);
+        ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 3, sizeof(cl_mem), ACC_OPENCL_MEM(dev_param_stack)),
+          "set batch-list argument of SMM-kernel", result);
+        if (1 < config->bs) {
+          ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 4, sizeof(int), &stack_size),
+            "set stacksize argument of SMM-kernel", result);
+        }
+        ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(*ACC_OPENCL_STREAM(stream),
+          config->kernel, 1/*work_dim*/, NULL, &work_size, &config->wgsize, 0, NULL, NULL),
+          "launch SMM-kernel", result);
+        LIBXSMM_ATOMIC_RELEASE(lock, LIBXSMM_ATOMIC_RELAXED);
+      }
+#if defined(OPENCL_LIBSMM_DEBUG_SMM)
+      ACC_OPENCL_CHECK(acc_memcpy_d2h(dev_c_data, test, csize, stream),
+        "transfer debug test", result);
+#endif
+#if defined(OPENCL_LIBSMM_DEBUG_SMM) || defined(OPENCL_LIBSMM_SYNC)
+      ACC_OPENCL_CHECK(acc_stream_sync(stream), "sync stream", result);
+#endif
+#if defined(OPENCL_LIBSMM_DEBUG_SMM)
+      if (EXIT_SUCCESS == result) {
+        const char *const env_tol = getenv("OPENCL_LIBSMM_SMM_TOLERANCE");
+        const double tolerance = ((NULL == env_tol || '\0' == *env_tol) ? 1E-3 : atof(env_tol));
+        const int *const params = host_param_stack + (4 <= nparams ? (nparams - 4) : 0);
+        size_t i;
+        fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
+          dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"),
+          m_max, n_max, k_max, max_kernel_dim, stream);
+        for (i = 0; i < ((size_t)stack_size * nparams); i += nparams) {
+          const size_t ia = (size_t)(params[i+0] - 1) * typesize;
+          const size_t ib = (size_t)(params[i+1] - 1) * typesize;
+          const size_t ic = (size_t)(params[i+2] - 1) * typesize;
+          assert(ia < asize && ib < bsize && ic < csize);
+          libxsmm_otrans(btrn, binp + ib, typesize, n_max, k_max, n_max, k_max);
+          kernel.xmm(ainp + ia, btrn, gold + ic);
+        }
+        /* some result may be validated multiple times in case of duplicated c-indexes */
+        for (i = 0; i < ((size_t)stack_size * nparams); i += nparams) {
+          const size_t ic = (size_t)(params[i+2] - 1) * typesize;
+          libxsmm_matdiff_info diff;
+          libxsmm_matdiff(&diff, (libxsmm_datatype)precision, m_max, n_max,
+            gold + ic, test + ic, &m_max/*ldref*/, &m_max/*ldtst*/);
+          if (tolerance < diff.normf_rel) {
+# if  LIBXSMM_VERSION3(1, 16, 1) <= LIBXSMM_VERSION3(LIBXSMM_VERSION_MAJOR, \
+      LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE) && 1014 <= LIBXSMM_VERSION_PATCH
+            fprintf(stderr, " => ERROR diff=%g (%g != %g)\n", diff.linf_abs, diff.v_ref, diff.v_tst);
+# else
+            fprintf(stderr, " => ERROR diff=%g\n", diff.linf_abs);
+# endif
+# if defined(_DEBUG)
+            opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold + ic, m_max, n_max);
+            opencl_libsmm_print_matrix(stderr, "test = ", datatype, test + ic, m_max, n_max);
+            fprintf(stderr, "\n");
+# endif
+            result = EXIT_FAILURE; break;
+          }
+        }
+        if (EXIT_SUCCESS == result) fprintf(stderr, " => OK\n");
+      }
+      libxsmm_free(ainp);
+      libxsmm_free(binp);
+      libxsmm_free(cinp);
+      libxsmm_free(test);
+      libxsmm_free(gold);
+      libxsmm_free(btrn);
+#elif defined(NDEBUG)
+      ACC_OPENCL_UNUSED(host_param_stack);
+      ACC_OPENCL_UNUSED(nparams);
+#endif
+    }
+  }
+  else if (0 < stack_size) { /* inhomogeneous, large kernel, or unsupported datatype */
+    return -1; /* TODO: document result code to trigger host-fallback */
+  }
+  ACC_OPENCL_RETURN(result);
+}
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*__OPENCL*/
diff --git a/src/acc/opencl/smm/opencl_libsmm.h b/src/acc/opencl/smm/opencl_libsmm.h
new file mode 100644
index 00000000000..31ec774447b
--- /dev/null
+++ b/src/acc/opencl/smm/opencl_libsmm.h
@@ -0,0 +1,89 @@
+/*------------------------------------------------------------------------------------------------*
+ * Copyright (C) by the DBCSR developers group - All rights reserved                              *
+ * This file is part of the DBCSR library.                                                        *
+ *                                                                                                *
+ * For information on the license, see the LICENSE file.                                          *
+ * For further information please visit https://dbcsr.cp2k.org                                    *
+ * SPDX-License-Identifier: GPL-2.0+                                                              *
+ *------------------------------------------------------------------------------------------------*/
+#ifndef OPENCL_LIBSMM_H
+#define OPENCL_LIBSMM_H
+
+#include "../../acc_libsmm.h"
+#include "../acc_opencl.h"
+
+#if defined(__LIBXSMM)
+# include <libxsmm.h>
+#else
+# error OpenCL backend currently depends on LIBXSMM!
+#endif
+
+#if !defined(OPENCL_LIBSMM_TRANS_INPLACE) && 0
+# define OPENCL_LIBSMM_TRANS_INPLACE
+#endif
+#if !defined(OPENCL_LIBSMM_PARAMS_DELIMS)
+# define OPENCL_LIBSMM_PARAMS_DELIMS ";, \t|/"
+#endif
+#if !defined(OPENCL_LIBSMM_DEBUG) && 0
+# define OPENCL_LIBSMM_DEBUG
+#endif
+#if !defined(OPENCL_LIBSMM_SYNC) && 0
+# define OPENCL_LIBSMM_SYNC
+#endif
+#if !defined(OPENCL_LIBSMM_CMEM) && 1
+# define OPENCL_LIBSMM_CMEM
+#endif
+#if !defined(OPENCL_LIBSMM_F32) /*&& !defined(__DBCSR_ACC)*/
+# define OPENCL_LIBSMM_F32
+#endif
+#if !defined(OPENCL_LIBSMM_F64) && 1
+# define OPENCL_LIBSMM_F64
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/** Type for querying transpose kernel/configuration. */
+typedef struct opencl_libsmm_transkey_t {
+  libsmm_acc_data_t type;
+  int m, n;
+} opencl_libsmm_transkey_t;
+
+/** Type for transpose kernel/configuration. */
+typedef struct opencl_libsmm_trans_t {
+  cl_kernel kernel;
+  size_t wgsize;
+} opencl_libsmm_trans_t;
+
+/** Type for querying SMM-kernel/configuration. */
+typedef struct opencl_libsmm_smmkey_t {
+  libsmm_acc_data_t type;
+  int m, n, k;
+} opencl_libsmm_smmkey_t;
+
+/** Type for SMM-kernel/configuration. */
+typedef struct opencl_libsmm_smm_t {
+  cl_kernel kernel;
+  size_t wgsize;
+  /* tuned parameters for SMM-kernels */
+  int bs, bm, bn;
+} opencl_libsmm_smm_t;
+
+/** If buffers are hinted for non-concurrent writes aka "OpenCL constant". */
+int opencl_libsmm_use_cmem(cl_device_id device);
+
+/* Tokenize parambuf and initialize key/value pair. */
+int opencl_libsmm_read_params(char* parambuf,
+  opencl_libsmm_smmkey_t* key, opencl_libsmm_smm_t* value);
+
+#if defined(OPENCL_LIBSMM_DEBUG) && defined(_DEBUG)
+void opencl_libsmm_print_matrix(FILE* ostream, const char* label,
+  libsmm_acc_data_t type, const void* mat, int m, int n);
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*OPENCL_LIBSMM_H*/
diff --git a/src/acc/opencl/smm/requirements.txt b/src/acc/opencl/smm/requirements.txt
new file mode 100644
index 00000000000..9767bc7d699
--- /dev/null
+++ b/src/acc/opencl/smm/requirements.txt
@@ -0,0 +1,2 @@
+wheel
+opentuner
diff --git a/src/acc/opencl/smm/tune_multiply.py b/src/acc/opencl/smm/tune_multiply.py
new file mode 100755
index 00000000000..5d871655797
--- /dev/null
+++ b/src/acc/opencl/smm/tune_multiply.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python3
+####################################################################################################
+# Copyright (C) by the DBCSR developers group - All rights reserved                                #
+# This file is part of the DBCSR library.                                                          #
+#                                                                                                  #
+# For information on the license, see the LICENSE file.                                            #
+# For further information please visit https://dbcsr.cp2k.org                                      #
+# SPDX-License-Identifier: GPL-2.0+                                                                #
+####################################################################################################
+#
+# This script is based on OpenTuner's tutorial
+# "Optimizing Block Matrix Multiplication", and
+# LIBXSMM's "xgemm" and "transpose" examples.
+#
+import opentuner
+from opentuner import ConfigurationManipulator
+from opentuner import MeasurementInterface
+from opentuner import IntegerParameter
+from opentuner import Result
+import json
+import glob
+import sys
+import re
+
+
+class SmmTuner(MeasurementInterface):
+    def manipulator(self):
+        """
+        Define the search space by creating a
+        ConfigurationManipulator
+        """
+        self.exepath = "../.."
+        self.exename = "acc_bench_smm"
+        run_result = self.call_program(self.exepath + "/" + self.exename + " 1 1 1")
+        if 0 == run_result["returncode"]:
+            match = re.search(
+                "typename \\(id=([0-9]+)\\):\\s+(\\w+)", str(run_result["stdout"])
+            )
+        else:
+            match = None
+        if (match is not None) and match.group(1) and match.group(2):
+            self.typename = match.group(2)
+            self.typeid = match.group(1)
+        else:
+            sys.tracebacklimit = 0
+            raise RuntimeError(
+                "Setup failed for " + self.exepath + "/" + self.exename + "!"
+            )
+        # sanitize input arguments
+        self.args.m = max(self.args.m, 1)
+        self.args.n = [max(self.args.n, 1), self.args.m][0 == self.args.n]
+        self.args.k = [max(self.args.k, 1), self.args.m][0 == self.args.k]
+        self.args.mb = max(self.args.mb, 1)
+        self.args.bs = max(min(self.args.bs, self.args.mb), 1)
+        self.args.bm = [max(self.args.bm, 1), self.args.m][0 == self.args.bm]
+        self.args.bn = [max(self.args.bn, 1), 1][0 == self.args.bn]
+        self.gflops = 0
+        # setup tunable parameters
+        manipulator = ConfigurationManipulator()
+        manipulator.add_parameter(IntegerParameter("BS", 1, self.args.mb))
+        manipulator.add_parameter(IntegerParameter("BM", 1, self.args.m))
+        manipulator.add_parameter(IntegerParameter("BN", 1, self.args.n))
+        return manipulator
+
+    def seed_configurations(self):
+        return [{"BS": self.args.bs, "BM": self.args.bm, "BN": self.args.bn}]
+
+    def objective(self):
+        return opentuner.search.objective.MaximizeAccuracyMinimizeSize()
+
+    def run(self, desired_result, input, limit):
+        """
+        Compile and run a given configuration then
+        return performance
+        """
+        cfg = desired_result.configuration.data
+        run_cmd = (
+            "OMP_PROC_BIND=TRUE CHECK="
+            + str(self.args.check)
+            + " OPENCL_LIBSMM_SMM_BATCHSIZE="
+            + str(cfg["BS"])
+            + " OPENCL_LIBSMM_SMM_BLOCK_M="
+            + str(cfg["BM"])
+            + " OPENCL_LIBSMM_SMM_BLOCK_N="
+            + str(cfg["BN"])
+            + " "
+            + self.exepath
+            + "/"
+            + self.exename
+            + " 0 0"
+            + " "
+            + str(self.args.m)
+            + " "
+            + str(self.args.n)
+            + " "
+            + str(max(self.args.k, 1))
+        )
+        run_result = self.call_program(run_cmd)
+        if 0 == run_result["returncode"]:
+            match = re.search(
+                "device:\\s+([0-9]+(\\.[0-9]*)*) ms\\s+([0-9]+(\\.[0-9]*)*)",
+                str(run_result["stdout"]),
+            )
+        else:
+            match = None
+        if (match is not None) and match.group(1) and match.group(3):
+            mseconds = float(match.group(1))
+            gflops = float(match.group(3))
+            self.gflops = max(self.gflops, gflops)
+            kernelreq = round(
+                (100.0 * cfg["BM"] * cfg["BN"]) / (self.args.m * self.args.n)
+            )
+            # gflops are reported as "accuracy" (console output)
+            return Result(time=mseconds, accuracy=gflops, size=kernelreq)
+        else:  # return non-competitive/bad result in case of an error
+            return Result(time=float("inf"), accuracy=0.0, size=100.0)
+
+    def save_final_config(self, configuration):
+        """called at the end of tuning"""
+        if 0 < self.gflops:
+            ofilename = (
+                "tune_multiply-"
+                + self.typename
+                + "-"
+                + str(self.args.m)
+                + "x"
+                + str(self.args.n)
+                + "x"
+                + str(self.args.k)
+                + "-"
+                + str(round(self.gflops))
+                + "gflops.json"
+            )
+            print(
+                "Result achieving "
+                + str(self.gflops)
+                + " GFLOPS/s ("
+                + self.typename
+                + ") was written to "
+                + ofilename
+            )
+            # extend result for easier reuse later
+            configuration.data["GFLOPS"] = self.gflops
+            configuration.data["TYPEID"] = self.typeid
+            configuration.data["M"] = self.args.m
+            configuration.data["N"] = self.args.n
+            configuration.data["K"] = self.args.k
+            # self.manipulator().save_to_file(configuration.data, ofilename)
+            with open(ofilename, "w") as ofile:
+                json.dump(configuration.data, ofile)
+                ofile.write("\n")  # append newline at EOF
+            # merge all JSONs into a single CSV file
+            if self.args.csvfile:
+                filenames = glob.glob("*.json")
+                merged = dict()
+                for ifilename in filenames:
+                    with open(ifilename, "r") as ifile:
+                        data = json.load(ifile)
+                        try:
+                            key = (data["TYPEID"], data["M"], data["N"], data["K"])
+                            value = (
+                                data["GFLOPS"],
+                                data["BS"],
+                                data["BM"],
+                                data["BN"],
+                                ifilename,
+                            )
+                            if key not in merged:
+                                merged[key] = value
+                            else:
+                                if merged[key][0] < value[0]:
+                                    ifilename = merged[key][-1]
+                                    merged[key] = value
+                                print(
+                                    "Superfluous "
+                                    + ifilename
+                                    + " ignored when merging CSV file"
+                                )
+                        except KeyError:
+                            print(
+                                "Malformed "
+                                + ifilename
+                                + " ignored when merging CSV file"
+                            )
+                            pass
+                if bool(merged):
+                    with open(self.args.csvfile, "w") as ofile:
+                        ofile.write(  # CSV header line
+                            self.args.csvsep.join(
+                                ["TYPEID", "M", "N", "K", "GFLOPS", "BS", "BM", "BN"]
+                            )
+                            + "\n"
+                        )
+                        for key, value in merged.items():  # CSV data lines
+                            strkey = self.args.csvsep.join([str(k) for k in key])
+                            strval = self.args.csvsep.join([str(v) for v in value[:-1]])
+                            ofile.write(strkey + self.args.csvsep + strval + "\n")
+                    print(
+                        "Merged "
+                        + str(len(merged))
+                        + " of "
+                        + str(len(filenames))
+                        + " JSONs into "
+                        + self.args.csvfile
+                    )
+
+
+if __name__ == "__main__":
+    argparser = opentuner.default_argparser()
+    argparser.add_argument(
+        "m", type=int, default=23, nargs="?", help="Shape of SMM-kernel (M)"
+    )
+    argparser.add_argument(
+        "n", type=int, default=0, nargs="?", help="Shape of SMM-kernel (N)"
+    )
+    argparser.add_argument(
+        "k", type=int, default=0, nargs="?", help="Shape of SMM-kernel (K)"
+    )
+    argparser.add_argument(
+        "-bm",
+        "--initial-bm",
+        type=int,
+        default=0,
+        nargs="?",
+        dest="bm",
+        help="Initial block/tile size (BM)",
+    )
+    argparser.add_argument(
+        "-bn",
+        "--initial-bn",
+        type=int,
+        default=0,
+        nargs="?",
+        dest="bn",
+        help="Initial block/tile size (BN)",
+    )
+    argparser.add_argument(
+        "-bs",
+        "--initial-bs",
+        type=int,
+        default=32,
+        nargs="?",
+        dest="bs",
+        help="Initial (mini-)batch size (BS)",
+    )
+    argparser.add_argument(
+        "-mb",
+        "--max-bs",
+        type=int,
+        default=256,
+        nargs="?",
+        dest="mb",
+        help="Maximum (mini-)batch size (BS)",
+    )
+    argparser.add_argument(
+        "-s",
+        "--csv-separator",
+        type=(lambda c: c if isinstance(c, str) and 1 == len(c) else False),
+        default=";",
+        nargs="?",
+        dest="csvsep",
+        help="Separator used in CSV-file",
+    )
+    argparser.add_argument(
+        "-c",
+        "--csv-filename",
+        type=str,
+        default="tune_multiply.csv",
+        nargs="?",
+        dest="csvfile",
+        help="Generate CSV-file",
+    )
+    argparser.add_argument(
+        "-v",
+        "--check",
+        type=float,
+        default=0,
+        nargs="?",
+        dest="check",
+        help="Validate kernel (epsilon)",
+    )
+    SmmTuner.main(argparser.parse_args())
diff --git a/src/acc/opencl/smm/tune_multiply.sh b/src/acc/opencl/smm/tune_multiply.sh
new file mode 100755
index 00000000000..939b452af5d
--- /dev/null
+++ b/src/acc/opencl/smm/tune_multiply.sh
@@ -0,0 +1,118 @@
+#!/usr/bin/env bash
+####################################################################################################
+# Copyright (C) by the DBCSR developers group - All rights reserved                                #
+# This file is part of the DBCSR library.                                                          #
+#                                                                                                  #
+# For information on the license, see the LICENSE file.                                            #
+# For further information please visit https://dbcsr.cp2k.org                                      #
+# SPDX-License-Identifier: GPL-2.0+                                                                #
+####################################################################################################
+
+HERE=$(cd "$(dirname "$0")" && pwd -P)
+SED=$(command -v gsed)
+LS=$(command -v ls)
+RM=$(command -v rm)
+WC=$(command -v wc)
+DELAY=12
+
+# GNU sed is desired (macOS)
+if [ "" = "${SED}" ]; then
+  SED=$(command -v sed)
+fi
+
+if [ "$1" ]; then
+  LIMIT=$1
+  shift
+fi
+if [ "$1" ]; then
+  NPARTS=$1
+  shift
+else
+  NPARTS=1
+fi
+if [ "$1" ]; then
+  PART=$1
+  shift
+else
+  PART=1
+fi
+if [ "$1" ]; then
+  TRIPLETS="$*"
+else
+  TRIPLETS=" \
+    4 5 7 9 13 25 26 28 32 45, \
+    13 14 25 26 32, \
+    5 32 13 24 26, \
+    14 16 29, \
+    14 32 29, \
+    16 29 55, \
+    32 29 55, \
+    9 32 22, \
+    4 10 15, \
+    6 7 8, \
+    23, \
+    64, \
+    78, \
+    12, \
+    6"
+fi
+
+if [ "0" != "$((NPARTS<PART))" ]; then
+  >&2 echo "ERROR: part-number ${PART} is larger than the requested ${NPARTS} parts!"
+  exit 1
+fi
+
+if [ "${SED}" ] && [ "${LS}" ] && [ "${RM}" ] && [ "${WC}" ]; then
+  echo "Usage: $0 [seconds-per-kernel [num-parts [part [triplet-spec]]]]"
+  echo "       num-parts and part (one-based), e.g., 12 3"
+  echo "         for this session being the 3rd of 12 sessions"
+  echo "       triplet-spec, e.g.,"
+  echo "         23, 5 32 13 24 26, 4 9"
+  echo
+  for SPECS in $(echo "${TRIPLETS}" | ${SED} -e "s/[[:space:]][[:space:]]*/x/g" -e "s/,/ /g"); do
+    SPEC=$(echo "${SPECS}" | ${SED} -e "s/^x//g" -e "s/x$//g" -e "s/x/,/g")
+    MNKS="${MNKS} $(eval printf "%s" "{${SPEC}}x{${SPEC}}x{${SPEC}}\" \"" | ${SED} -e "s/{//g" -e "s/}//g")"
+  done
+  NTRIPLETS=$(echo "${MNKS}" | wc -w)
+  PARTSIZE=$(((NTRIPLETS+NPARTS-1)/NPARTS))
+  PARTOFFS=$(((PART-1)*PARTSIZE))
+  PARTSIZE=$((PARTSIZE<=(NTRIPLETS-PARTOFFS)?PARTSIZE:(NTRIPLETS-PARTOFFS)))
+  if [ "0" != "$((NPARTS<=NTRIPLETS))" ]; then
+    echo "Session ${PART} of ${NPARTS} part(s)."
+  else
+    echo "Session ${PART} of ${NPARTS} part(s). The problem is over-decomposed!"
+  fi
+  if [ "${LIMIT}" ]; then
+    HRS=$((LIMIT*PARTSIZE/3600))
+    MNS=$(((LIMIT*PARTSIZE-HRS*3600+59)/60))
+    echo "Tuning ${PARTSIZE} kernels in this session will take about ${HRS}h${MNS}m."
+    LIMIT="--stop-after=${LIMIT}"
+  else
+    echo "Tuning ${PARTSIZE} kernels will take an unknown time (no limit given)."
+  fi
+  NJSONS=$(${LS} -1 ./*.json 2>/dev/null | ${WC} -l)
+  if [ "0" != "${NJSONS}" ]; then
+    echo "There are already ${NJSONS} (unrelated?) JSON-files found."
+  fi
+  SLEEP=$(command -v sleep)
+  if [ "${DELAY}" ] && [ "${SLEEP}" ]; then
+    echo
+    echo "Tuning will start in ${DELAY} seconds. Hit CTRL-C to abort."
+    ${SLEEP} ${DELAY}
+  fi
+  N=0
+  for MNK in ${MNKS}; do
+    if [ "0" != "$((PARTOFFS<=N))" ]; then
+      TRIPLET=$(echo "${MNK}" | ${SED} "s/x/ /g")
+      echo
+      echo "Started auto-tuning ${MNK}-kernel..."
+      # avoid mixing database of previous results into new session
+      ${RM} -rf "${HERE}/opentuner.db"
+      eval "${HERE}/tune_multiply.py ${TRIPLET} --no-dups ${LIMIT}"
+    fi
+    N=$((N+1))
+  done
+else
+  >&2 echo "ERROR: missing prerequisites!"
+  exit 1
+fi
diff --git a/src/cmake/DBCSRConfig.cmake.in b/src/cmake/DBCSRConfig.cmake.in
index d229168f2d2..7cbb327254a 100644
--- a/src/cmake/DBCSRConfig.cmake.in
+++ b/src/cmake/DBCSRConfig.cmake.in
@@ -12,13 +12,13 @@ if (@USE_OPENMP@)
   find_dependency(OpenMP)
 endif ()
 
-if (@USE_CUDA@)
+if ("@USE_ACCEL@" MATCHES "cuda")
   enable_language(CUDA)
 endif ()
 
-if ("@USE_SMM@" MATCHES "libxsmm")
+if (("@USE_SMM@" MATCHES "libxsmm") OR ("@USE_ACCEL@" MATCHES "opencl"))
   find_package(PkgConfig)
-  pkg_check_modules(deps REQUIRED IMPORTED_TARGET libxsmmf)
+  pkg_check_modules(LIBXSMM IMPORTED_TARGET GLOBAL libxsmmf)
 endif ()
 
 include("${CMAKE_CURRENT_LIST_DIR}/DBCSRTargets.cmake")
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b33ba5800ab..c1c47840e95 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -15,7 +15,7 @@ endif ()
 
 # =================================== DBCSR PERF TESTS
 set(DBCSR_PERF_SRCS dbcsr_performance_driver.F dbcsr_performance_multiply.F)
-if (USE_HIP)
+if (USE_ACCEL MATCHES "hip")
   hip_add_executable(dbcsr_perf ${DBCSR_PERF_SRCS})
 else ()
   add_executable(dbcsr_perf ${DBCSR_PERF_SRCS})
@@ -66,6 +66,13 @@ set(DBCSR_TESTS_FTN
     dbcsr_tas_unittest
     dbcsr_test_csr_conversions)
 
+if (NOT USE_ACCEL STREQUAL "")
+  # TODO: enable dbcsr_acc_test for HIP
+  if (NOT USE_ACCEL MATCHES "hip")
+    set(DBCSR_TESTS_C dbcsr_acc_test_c)
+  endif ()
+endif ()
+
 if (NOT (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray"))
   set(DBCSR_TESTS_SRCS_CPP dbcsr_tensor_test.cpp)
 endif ()
@@ -79,6 +86,7 @@ set(dbcsr_tensor_unittest_SRCS dbcsr_tensor_unittest.F)
 set(dbcsr_tas_unittest_SRCS dbcsr_tas_unittest.F)
 set(dbcsr_test_csr_conversions_SRCS dbcsr_test_csr_conversions.F)
 set(dbcsr_tensor_test_cpp_SRCS dbcsr_tensor_test.cpp)
+set(dbcsr_acc_test_c_SRCS dbcsr_acc_test.c)
 
 # Make a list of the source files of fortran tests
 set(DBCSR_TESTS_SRCS_FTN)
@@ -106,7 +114,7 @@ target_link_libraries(dbcsr_unittest_common PUBLIC dbcsr)
 
 # Compile Fortran tests
 foreach (dbcsr_test ${DBCSR_TESTS_FTN})
-  if (USE_HIP)
+  if (USE_ACCEL MATCHES "hip")
     hip_add_executable(${dbcsr_test} ${${dbcsr_test}_SRCS})
   else ()
     add_executable(${dbcsr_test} ${${dbcsr_test}_SRCS})
@@ -162,6 +170,30 @@ if (WITH_C_API)
                                           OMP_NUM_THREADS=${TEST_OMP_THREADS})
     endif ()
   endforeach ()
+
+  foreach (dbcsr_test_c ${DBCSR_TESTS_C})
+    if (USE_ACCEL MATCHES "hip")
+      hip_add_executable(${dbcsr_test_c} ${${dbcsr_test_c}_SRCS})
+    else ()
+      add_executable(${dbcsr_test_c} ${${dbcsr_test_c}_SRCS})
+    endif ()
+    target_link_libraries(${dbcsr_test_c} dbcsr_c)
+    # register unittest executable with CMake
+    if (USE_MPI)
+      separate_arguments(MPIEXEC_PREFLAGS)
+      add_test(
+        NAME ${dbcsr_test_c}
+        COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${num_ranks}
+                ${MPIEXEC_PREFLAGS} ./${dbcsr_test_c} ${MPIEXEC_POSTFLAGS})
+    else ()
+      add_test(NAME ${dbcsr_test_c} COMMAND ./${dbcsr_test_c})
+    endif ()
+    if (OpenMP_FOUND)
+      set_tests_properties(
+        ${dbcsr_test_c} PROPERTIES ENVIRONMENT
+                                   OMP_NUM_THREADS=${TEST_OMP_THREADS})
+    endif ()
+  endforeach ()
 endif ()
 
 # =================================== GPU BACKEND TESTS (CUDA / HIP)
@@ -199,7 +231,7 @@ add_custom_command(
 add_custom_target(generate_libsmm_acc_timer_multiply_test_cpp
                   DEPENDS libsmm_acc_timer_multiply.cpp)
 
-if (USE_CUDA OR USE_HIP)
+if (USE_ACCEL MATCHES "cuda|hip")
 
   # All libsmm_acc tests
   set(LIBSMM_ACC_TESTS_SRCS
@@ -212,7 +244,7 @@ if (USE_CUDA OR USE_HIP)
                              libsmm_acc_unittest_transpose)
 
   # Add executables for all libsmm_acc tests
-  if (USE_CUDA)
+  if (USE_ACCEL MATCHES "cuda")
 
     foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS})
 
@@ -232,7 +264,7 @@ if (USE_CUDA OR USE_HIP)
 
     endforeach ()
 
-  else () # i.e. USE_HIP
+  elseif (USE_ACCEL MATCHES "hip")
 
     foreach (libsmm_acc_test ${LIBSMM_ACC_TESTS_SRCS})
       set_source_files_properties(${libsmm_acc_test}