diff --git a/.clang-format b/.clang-format
new file mode 100644
index 00000000000..b721f806e4f
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,178 @@
+---
+Language:        Cpp
+# BasedOnStyle:  LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: None
+AlignConsecutiveMacros: None
+AlignConsecutiveAssignments: None
+AlignConsecutiveBitFields: None
+AlignConsecutiveDeclarations: None
+AlignEscapedNewlines: Right
+AlignOperands:   Align
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortEnumsOnASingleLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortLambdasOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
+AttributeMacros:
+  - __capability
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterCaseLabel:  false
+  AfterClass:      false
+  AfterControlStatement: Never
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  BeforeLambdaBody: false
+  BeforeWhile:     false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeConceptDeclarations: true
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: true
+DerivePointerAlignment: false
+DisableFormat:   false
+EmptyLineAfterAccessModifier: Never
+EmptyLineBeforeAccessModifier: LogicalBlock
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IfMacros:
+  - KJ_IF_MAYBE
+IncludeBlocks:   Preserve
+IncludeCategories:
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        2
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '^(<|"(gtest|gmock|isl|json)/)'
+    Priority:        3
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '.*'
+    Priority:        1
+    SortPriority:    0
+    CaseSensitive:   false
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentCaseLabels: false
+IndentCaseBlocks: false
+IndentGotoLabels: true
+IndentPPDirectives: None
+IndentExternBlock: AfterExternBlock
+IndentRequires:  false
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+InsertTrailingCommas: None
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+LambdaBodyIndentation: Signature
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 2
+ObjCBreakBeforeNestedBlockParam: true
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PenaltyIndentedWhitespace: 0
+PointerAlignment: Right
+PPIndentWidth:   -1
+ReferenceAlignment: Pointer
+ReflowComments:  false
+ShortNamespaceLines: 1
+SortIncludes:    CaseSensitive
+SortJavaStaticImport: Before
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCaseColon: false
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceAroundPointerQualifiers: Default
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  Never
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInLineCommentPrefix:
+  Minimum:         1
+  Maximum:         -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+BitFieldColonSpacing: Both
+Standard:        Latest
+StatementAttributeLikeMacros:
+  - Q_EMIT
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth:        8
+UseCRLF:         false
+UseTab:          Never
+WhitespaceSensitiveMacros:
+  - STRINGIZE
+  - PP_STRINGIZE
+  - BOOST_PP_STRINGIZE
+  - NS_SWIFT_NAME
+  - CF_SWIFT_NAME
+...
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 52d81d02172..c566a3dc630 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,11 +5,13 @@ exclude: '^tools/(build_utils/fypp)'
 fail_fast: false
 repos:
 - repo: https://github.com/ambv/black
-  rev: 21.5b1
+  rev: 22.1.0
   hooks:
   - id: black
+    name: Reformat Python files with the black code formatter
+    files: '^.*((/PACKAGE)|(\.py))$'  # group alternatives so both are anchored at end of path
 - repo: https://gitlab.com/pycqa/flake8
-  rev: 3.8.4
+  rev: 4.0.1
   hooks:
   - id: flake8
     exclude: >-
@@ -17,10 +19,12 @@ repos:
         .cp2k/.*|
       )$
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.4.0
+  rev: v4.1.0
   hooks:
   - id: check-ast
   - id: check-yaml
+  - id: check-symlinks
+  - id: trailing-whitespace
 - repo: https://github.com/pseewald/fprettify
   rev: v0.3.7
   hooks:
diff --git a/LICENSE b/LICENSE
index c4dec7ebadc..e648aa59503 100644
--- a/LICENSE
+++ b/LICENSE
@@ -291,7 +291,7 @@ convey the exclusion of warranty; and each file should have at least
 the "copyright" line and a pointer to where the full notice is found.
 
     DBCSR: Distributed Block Compressed Sparse Row matrix library
-    Copyright (C) by the DBCSR developers group - All rights reserved 
+    Copyright (C) by the DBCSR developers group - All rights reserved
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
diff --git a/docs/guide/3-developer-guide/1-tooling/index.md b/docs/guide/3-developer-guide/1-tooling/index.md
index 8470abe8b65..bf113b8966e 100644
--- a/docs/guide/3-developer-guide/1-tooling/index.md
+++ b/docs/guide/3-developer-guide/1-tooling/index.md
@@ -2,7 +2,7 @@ title: Tooling
 
 # Build System
 
-We support CMake for compilation. See [here](../../2-user-guide/1-installation/index.html) on how to compile and 
+We support CMake for compilation. See [here](../../2-user-guide/1-installation/index.html) on how to compile and
 [here](../../2-user-guide/1-installation/1-cmake-build-recipes.html) for more CMake details.
 
 Compilations is based on [Fypp](https://github.com/aradi/fypp) meta-progamming package, which is available as submodule.
diff --git a/docs/guide/3-developer-guide/4-performance/1-insights.md b/docs/guide/3-developer-guide/4-performance/1-insights.md
index fc30fd1615c..7847c782ad8 100644
--- a/docs/guide/3-developer-guide/4-performance/1-insights.md
+++ b/docs/guide/3-developer-guide/4-performance/1-insights.md
@@ -102,7 +102,7 @@ The columns describe:
     - `AVERAGE`: averaged over all MPI ranks
     - `MAXIMUM`: maximum over all MPI ranks
     - `AVERAGE` and `MAXIMUM` can be used to locate load-imbalance or synchronization points.
-- `MAXRANKS`: 
+- `MAXRANKS`:
 
 #### Time spent in Just-In-Time (JIT) Compilation
 
diff --git a/src/acc/libsmm_acc/predict/README.md b/src/acc/libsmm_acc/predict/README.md
index f3cee3c785a..8f3bf3ad727 100644
--- a/src/acc/libsmm_acc/predict/README.md
+++ b/src/acc/libsmm_acc/predict/README.md
@@ -96,7 +96,7 @@ Explore the data interactively using the [provided Jupyter notebook](notebooks/i
 
 #### 4. Train
 
-For each algorithm, build a predictive model using decision trees and feature selection based on the features' permutation importance. 
+For each algorithm, build a predictive model using decision trees and feature selection based on the features' permutation importance.
 
 ```bash
 ./predict_train.py  # --algo medium --folder /scratch/autotuning_dataset, e.g.
diff --git a/src/acc/libsmm_acc/predict/predict_helpers.py b/src/acc/libsmm_acc/predict/predict_helpers.py
index 1ff82f7a887..a67de2081fc 100644
--- a/src/acc/libsmm_acc/predict/predict_helpers.py
+++ b/src/acc/libsmm_acc/predict/predict_helpers.py
@@ -29,7 +29,7 @@ def safe_pickle(data, file):
     :param data: data to be pickled
     :param file: file to pickle it into
     """
-    max_bytes = 2 ** 31 - 1  # Maximum number of bytes to write in one chunk
+    max_bytes = 2**31 - 1  # Maximum number of bytes to write in one chunk
     pickle_out = pickle.dumps(data)
     n_bytes = len(pickle_out)
     with open(file, "wb") as f:
@@ -47,7 +47,7 @@ def safe_pickle_load(file_path):
     :param data: data to be loaded through pickle
     :param file: file to read from
     """
-    max_bytes = 2 ** 31 - 1  # Maximum number of bytes to read in one chunk
+    max_bytes = 2**31 - 1  # Maximum number of bytes to read in one chunk
     bytes_in = bytearray(0)
     input_size = os.path.getsize(file_path)
     with open(file_path, "rb") as f:
diff --git a/src/acc/libsmm_acc/predict/predict_train.py b/src/acc/libsmm_acc/predict/predict_train.py
index f752f4d528f..1e6a751bdda 100755
--- a/src/acc/libsmm_acc/predict/predict_train.py
+++ b/src/acc/libsmm_acc/predict/predict_train.py
@@ -489,7 +489,7 @@ def read_data(algo, read_from, nrows, folder, log):
     X = X.drop(cols_to_drop, axis=1)
     log += print_and_log(
         "X    : {:>8,} x {:>8,} ({:>2.2} MB)".format(
-            len(X), len(X.columns), sys.getsizeof(X) / 10 ** 6
+            len(X), len(X.columns), sys.getsizeof(X) / 10**6
         )
     )
     log += print_and_log("Head:")
@@ -505,7 +505,7 @@ def read_data(algo, read_from, nrows, folder, log):
     log += print_and_log("\nRead Y")
     Y = dd.read_parquet(parquet_data_file, columns=["perf_scaled"])
     log += print_and_log(
-        "Y    : {:>8,} ({:>2.2} MB)".format(len(Y), sys.getsizeof(Y) / 10 ** 6)
+        "Y    : {:>8,} ({:>2.2} MB)".format(len(Y), sys.getsizeof(Y) / 10**6)
     )
     log += print_and_log("Head:")
     log += print_and_log(Y.head())
@@ -516,7 +516,7 @@ def read_data(algo, read_from, nrows, folder, log):
     X_mnk = dd.read_parquet(parquet_data_file, columns=["mnk"])
     nrows_data = len(X_mnk.index)
     log += print_and_log(
-        "X_mnk : {:>8,} ({:>2.2} MB)".format(nrows_data, sys.getsizeof(X_mnk) / 10 ** 6)
+        "X_mnk : {:>8,} ({:>2.2} MB)".format(nrows_data, sys.getsizeof(X_mnk) / 10**6)
     )
     log += print_and_log("Head:")
     log += print_and_log(X_mnk.head())
diff --git a/src/acc/libsmm_acc/tune/README.md b/src/acc/libsmm_acc/tune/README.md
index bcb8b0bf9b1..d3d0b075ed0 100644
--- a/src/acc/libsmm_acc/tune/README.md
+++ b/src/acc/libsmm_acc/tune/README.md
@@ -160,7 +160,7 @@ Each tune-directory contains a job file. Since there might be many tune-director
 When `tune_submit.py` is called without arguments, it will just list the jobs that could be submitted:
 
 ```bash
-$ ./tune_submit.py 
+$ ./tune_submit.py
           tune_5x5x5: Would submit, run with "doit!"
           tune_5x5x8: Would submit, run with "doit!"
           tune_5x8x5: Would submit, run with "doit!"
diff --git a/src/data/dbcsr.fypp b/src/data/dbcsr.fypp
index 9fb499a28e0..5e648ac9fde 100644
--- a/src/data/dbcsr.fypp
+++ b/src/data/dbcsr.fypp
@@ -55,7 +55,7 @@
 #:set carry = 1
 #:for i in range(0,len(num))
 #:set outi = 0
-#:if carry == 1 
+#:if carry == 1
  #:if num[i] == 0
   #:set outi = 1
   #:set carry = 0
@@ -63,7 +63,7 @@
   #:set outi = 0
   #:set carry = 1
  #:endif
-#:else 
+#:else
  #:set outi = num[i]
 #:endif
 #:mute
@@ -86,11 +86,11 @@ $: numout.append(outi)
  #! generates a list of permutations from n entries
  #! example n = 2 -> [[0,0],[0,1],[1,0],[1,1]] where 0/1 means present/not present
  #:set idx = []
- #:set newidx = [] 
+ #:set newidx = []
  ${init(idx,n)}$
 
  #:set imax = pow(2,n)
- #:for i in range(0,imax) 
+ #:for i in range(0,imax)
         $: permlist.append(idx)
         ${add_num(idx,newidx)}$
         #:set idx = newidx
@@ -104,8 +104,8 @@ $: numout.append(outi)
  #! generates permuted groups of variables from a variable list
  #! optional variables that appear together may be grouped
  #! example: varlist = [[var1], [var2,var3]]
- #! this gives: vargroups = [ [[var1],[var2,var3]], [[var1]], [[var2,var3]], []] 
- #:set permlist = [] 
+ #! this gives: vargroups = [ [[var1],[var2,var3]], [[var1]], [[var2,var3]], []]
+ #:set permlist = []
  ${gen_permlist(permlist,len(varlist))}$
  #:for p in permlist
     #:set group = []
@@ -141,7 +141,7 @@ $: prefix * (bool(len(group))) + ", ".join([str(i) + ' = ' + str(i) for i in fla
 #:def print_groupif(vargroups,varlist,i,check='PRESENT',prefix='')
 #! for a group [[var1]] and a varlist [[var1]],[var2,var3]]
 #! prints "(ELSE) IF (PRESENT(var1) .AND. .NOT. PRESENT(var2) .AND. .NOT. PRESENT(var3)) THEN"
-#! to be used in a loop 
+#! to be used in a loop
 #:set group = vargroups[i]
 #:set diff = [item for item in varlist if item not in group]
 #:set stat = "ELSE IF"
@@ -157,9 +157,9 @@ $: prefix * (bool(len(group))) + ", ".join([str(i) + ' = ' + str(i) for i in fla
 ${flatten(group,flatgroup)}$
 ${flatten(diff,flatdiff)}$
 #:endmute
-$: stat + "(" +  " .AND. ".join([check + "(" +  prefix + str(i) + ")" for i in flatgroup]) & 
+$: stat + "(" +  " .AND. ".join([check + "(" +  prefix + str(i) + ")" for i in flatgroup]) &
          + " .AND. " * (bool(len(diff)) * bool(len(diff) - len(varlist))) &
-         + " .AND. ".join([".NOT. " + check + "(" +  prefix + str(i) + ")" for i in flatdiff]) + ") THEN " 
+         + " .AND. ".join([".NOT. " + check + "(" +  prefix + str(i) + ")" for i in flatdiff]) + ") THEN "
 #:else
 ELSE
 #:endif
diff --git a/src/dbcsr_api_c.F b/src/dbcsr_api_c.F
index 1304cc03b9f..eeb8939c16b 100644
--- a/src/dbcsr_api_c.F
+++ b/src/dbcsr_api_c.F
@@ -1406,7 +1406,7 @@ SUBROUTINE c_dbcsr_get_${var}$ (c_matrix, c_${var}$, c_size) BIND(C, name="c_dbc
             END DO
          #:else
             DO i = 1, c_size
-               c_${var}$ (i) = ${var}$ (i); 
+               c_${var}$ (i) = ${var}$ (i)
             END DO
          #:endif
          NULLIFY (${var}$)
diff --git a/src/ops/PACKAGE b/src/ops/PACKAGE
index 4684cb7adb0..c490ac0e14b 100644
--- a/src/ops/PACKAGE
+++ b/src/ops/PACKAGE
@@ -1,6 +1,6 @@
 {
 "description": "High level DBCSR operations",
 "archive": "libdbcsr",
-"requires": ["../acc", "../mpi", "../data", "../base", "../dist", 
+"requires": ["../acc", "../mpi", "../data", "../base", "../dist",
 	     "../block", "../utils", "../core", "../mm", "../work"],
 }
diff --git a/src/tensors/dbcsr_tensor.fypp b/src/tensors/dbcsr_tensor.fypp
index d66e4b9691b..061022abecd 100644
--- a/src/tensors/dbcsr_tensor.fypp
+++ b/src/tensors/dbcsr_tensor.fypp
@@ -127,7 +127,7 @@ $:    ", ".join(["int** " + name + "_" + str(i) + ", " + "int* " + name + "_" +
 #:set carry = 1
 #:for i in range(0,len(num))
 #:set outi = 0
-#:if carry == 1 
+#:if carry == 1
  #:if num[i] == 0
   #:set outi = 1
   #:set carry = 0
@@ -135,7 +135,7 @@ $:    ", ".join(["int** " + name + "_" + str(i) + ", " + "int* " + name + "_" +
   #:set outi = 0
   #:set carry = 1
  #:endif
-#:else 
+#:else
  #:set outi = num[i]
 #:endif
 #:mute
@@ -158,11 +158,11 @@ $: numout.append(outi)
  #! generates a list of permutations from n entries
  #! example n = 2 -> [[0,0],[0,1],[1,0],[1,1]] where 0/1 means present/not present
  #:set idx = []
- #:set newidx = [] 
+ #:set newidx = []
  ${init(idx,n)}$
 
  #:set imax = pow(2,n)
- #:for i in range(0,imax) 
+ #:for i in range(0,imax)
         $: permlist.append(idx)
         ${add_num(idx,newidx)}$
         #:set idx = newidx
@@ -176,8 +176,8 @@ $: numout.append(outi)
  #! generates permuted groups of variables from a variable list
  #! optional variables that appear together may be grouped
  #! example: varlist = [[var1], [var2,var3]]
- #! this gives: vargroups = [ [[var1],[var2,var3]], [[var1]], [[var2,var3]], []] 
- #:set permlist = [] 
+ #! this gives: vargroups = [ [[var1],[var2,var3]], [[var1]], [[var2,var3]], []]
+ #:set permlist = []
  ${gen_permlist(permlist,len(varlist))}$
  #:for p in permlist
     #:set group = []
@@ -213,7 +213,7 @@ $: prefix * (bool(len(group))) + ", ".join([str(i) + ' = ' + str(i) for i in fla
 #:def print_groupif(vargroups,varlist,i,check='PRESENT',prefix='')
 #! for a group [[var1]] and a varlist [[var1]],[var2,var3]]
 #! prints "(ELSE) IF (PRESENT(var1) .AND. .NOT. PRESENT(var2) .AND. .NOT. PRESENT(var3)) THEN"
-#! to be used in a loop 
+#! to be used in a loop
 #:set group = vargroups[i]
 #:set diff = [item for item in varlist if item not in group]
 #:set stat = "ELSE IF"
@@ -229,9 +229,9 @@ $: prefix * (bool(len(group))) + ", ".join([str(i) + ' = ' + str(i) for i in fla
 ${flatten(group,flatgroup)}$
 ${flatten(diff,flatdiff)}$
 #:endmute
-$: stat + "(" +  " .AND. ".join([check + "(" +  prefix + str(i) + ")" for i in flatgroup]) & 
+$: stat + "(" +  " .AND. ".join([check + "(" +  prefix + str(i) + ")" for i in flatgroup]) &
          + " .AND. " * (bool(len(diff)) * bool(len(diff) - len(varlist))) &
-         + " .AND. ".join([".NOT. " + check + "(" +  prefix + str(i) + ")" for i in flatdiff]) + ") THEN " 
+         + " .AND. ".join([".NOT. " + check + "(" +  prefix + str(i) + ")" for i in flatdiff]) + ") THEN "
 #:else
 ELSE
 #:endif
diff --git a/tools/build_libsmm/README b/tools/build_libsmm/README
index aedac067eb5..12544806fe9 100644
--- a/tools/build_libsmm/README
+++ b/tools/build_libsmm/README
@@ -1,11 +1,11 @@
 libsmm: a library for small matrix multiplies.
 
 In order to deal efficiently with small matrix multiplies,
-often involving 'special' matrix dimensions such as 5,13,17,22, 
+often involving 'special' matrix dimensions such as 5,13,17,22,
 a dedicated matrix library can be generated that outperforms (or matches)
 general purpose (optimized) blas libraries.
 
-Generation requires extensive compilation and timing runs, and is machine specific, 
+Generation requires extensive compilation and timing runs, and is machine specific,
 i.e. the library should be constructed on the architecture it is supposed to run.
 
 Users can modify the values inside the file config.in to set which kind of library
@@ -18,7 +18,7 @@ Below you can find the detailed instructions for some examples.
 
 ====================================================================================================================
 a) How to generate the library running several jobs in a cluster, where each
-   node allows for both execution and compilation. 
+   node allows for both execution and compilation.
    For this example we will use a CRAY system with GNU compiler and SLURM.
    Run "./generate -h" to see the meaning of the options.
 
@@ -34,11 +34,11 @@ a) How to generate the library running several jobs in a cluster, where each
       Then run: ./generate -c config/cray.gnu small2
       This command collects all results produced in the small1 phase and it
       generates a file small_gen_optimal_dnn_cray.gnu.out
-      
+
    4) Run: ./generate -c config/cray.gnu -t 16 -w slurm lib
       This commman submit in batch a single job that compiles the library.
       At the end the library is produced inside the directory lib/
-      (libsmm_dnn_cray.gnu.a). 
+      (libsmm_dnn_cray.gnu.a).
 
    5) It is highly recommended to run the final test to check the correctness of the library.
       Run: ./generate -c config/cray.gnu -j 20 -w slurm check1
@@ -51,15 +51,15 @@ a) How to generate the library running several jobs in a cluster, where each
 
 
 ====================================================================================================================
-b) How to generate the library running a single job interactively. 
+b) How to generate the library running a single job interactively.
    For this example we will use a Linux system with GNU compiler.
    Run "./generate -h" to see the meaning of the options.
 
    1) Run: ./generate -c config/linux.gnu -j 10 -t 16 -w none tiny1
-      This command generates, compiles and executes the tiny kernels 
+      This command generates, compiles and executes the tiny kernels
       in 10 groups. Please increase the number of groups (-j <#> option)
       if you get the error "Argument list too long".
-   
+
    2) Run: ./generate -c config/linux.gnu tiny2
       This command collects all results produced in the tiny1 phase and it
       generates a file tiny_gen_optimal_dnn_linux.gnu.out
@@ -88,7 +88,7 @@ c) How to generate the library for the Intel Xeon Phi in batch mode.
    Run "./generate -h" to see the meaning of the options.
    We use the config file mic.intel (inside the directory config).
    Check if all options are OK for your case, in particular:
-    - the target_compile variable with the flag "-offload-attribute-target=mic". 
+    - the target_compile variable with the flag "-offload-attribute-target=mic".
     - the target_compile_offload variable with the flag "-offload=mandatory".
     - Set the MIC_OMP_NUM_THREADS variable to the number of cores on the card.
 
@@ -96,11 +96,11 @@ c) How to generate the library for the Intel Xeon Phi in batch mode.
    Phi. Performance output files are written in the same directory where the
    library is executed on the host, therefore this directory must be exported
    to the Xeon Phi with the right permission (read/write).
-   
+
    1) Run: ./generate -c config/mic.intel -j 100 -t 16 -w slurm tiny1
       This command submits 100 jobs in batch. Each job offloads executions
       to the Intel Xeon Phi card (MIC_OMP_NUM_THREADS threads). Wait until
-      completion of all jobs. 
+      completion of all jobs.
 
    2) Run: ./generate -c config/mic.intel tiny2
       This command collects all results of the tiny1 phase and it generates
@@ -116,7 +116,7 @@ c) How to generate the library for the Intel Xeon Phi in batch mode.
    4) Run: ./generate -c config/mic.intel -t 16 -w slurm lib
       This commman submit in batch a single job that compiles the library.
       At the end the library is produced inside the directory lib/
-      (libsmm_dnn_mic.intel.a). 
+      (libsmm_dnn_mic.intel.a).
 
    5) It is highly recommended to run the final test to check the correctness of the library.
       Run: ./generate -c config/mic.intel -j 200 -w slurm check1
diff --git a/tools/build_libsmm/config.in b/tools/build_libsmm/config.in
index dd6eb64844c..e2412d9985c 100644
--- a/tools/build_libsmm/config.in
+++ b/tools/build_libsmm/config.in
@@ -13,14 +13,14 @@ transpose_flavor=1
 # 1) d => double precision real
 # 2) s => single precision real
 # 3) z => double precision complex
-# 4) c => single precision complex 
+# 4) c => single precision complex
 #
 # select a data_type from the list 1 2 3 4
 #
 data_type=1
 
 #
-# matrix dimensions for which optimized routines will be generated. 
+# matrix dimensions for which optimized routines will be generated.
 # since all combinations of M,N,K are being generated the size of the library becomes very large
 # if too many sizes are being optimized for. Numbers have to be ascending.
 #
diff --git a/tools/build_libsmm/generate b/tools/build_libsmm/generate
index 65e09747be6..91eb0a46911 100755
--- a/tools/build_libsmm/generate
+++ b/tools/build_libsmm/generate
@@ -49,7 +49,7 @@ show_help() {
     echo "                           all: generate, compile and run the kernels."
     echo "                        source: only generate the source file kernels."
     echo "                       compile: generate and compile the kernels."
-    echo "                      Default value is \"${def_target}\"." 
+    echo "                      Default value is \"${def_target}\"."
     echo
     echo "COMMAND is one of the followings:"
     echo "   tiny1  : it runs the tiny phase. Batch execution if requested."
@@ -96,7 +96,7 @@ while getopts "c:hj:s:t:w:m:a:" OPTION; do
     case $OPTION in
 	c)
 	    config_file=$OPTARG
-	    ;;  
+	    ;;
 	h)
 	    show_help
 	    ;;
@@ -106,7 +106,7 @@ while getopts "c:hj:s:t:w:m:a:" OPTION; do
 	    ;;
 	s)
 	    SIMD=$OPTARG
-	    ;;	    
+	    ;;
 	t)
 	    ntasks=$OPTARG
 	    check_number $ntasks $OPTION $ntasks
@@ -125,10 +125,10 @@ while getopts "c:hj:s:t:w:m:a:" OPTION; do
 		*)
 		    echo "Warning: target \"$OPTARG\" unknown. Run ./generate -h for help."
 		    exit
-		    ;; 
+		    ;;
 	    esac
 	    ;;
-        ?) 
+        ?)
             exit
             ;;
     esac
@@ -366,7 +366,7 @@ case "${transpose_flavor}" in
 	tb="N"
 	decl="A(K,M), B(K,N)"
 	lds="LDA=K ; LDB=K"
-	;; 
+	;;
     3 )
 	type_label+="nt"
 	ta="N"
@@ -400,7 +400,7 @@ case $cmd in
 	;;
     check1|check2)
 	run_dir+="_check"
-	
+
 	;;
 esac
 run_dir+="${type_label}"
diff --git a/tools/build_libsmm/generate.bash b/tools/build_libsmm/generate.bash
index c8c7a7206bb..e3b2a1d1e6a 100644
--- a/tools/build_libsmm/generate.bash
+++ b/tools/build_libsmm/generate.bash
@@ -1,6 +1,6 @@
 #
 # Author: Alfio Lazzaro, alfio.lazzaro@mat.ethz.ch (2013-2015)
-# Library for the generate script used in LIBSMM library    
+# Library for the generate script used in LIBSMM library
 #
 
 #
@@ -24,7 +24,7 @@ write_makefile_header() {
     printf "LIBSMM_INDICES = \$(wordlist \$(LIBSMM_SI),\$(LIBSMM_EI),\$(LIBSMM_DIMS_INDICES))\n\n"
 
     #
-    # output directory for compiled and results files 
+    # output directory for compiled and results files
     #
     printf "LIBSMM_WORKDIR=${work_dir}\n\n"
 
@@ -32,12 +32,12 @@ write_makefile_header() {
     # list of source files
     #
     printf "LIBSMM_SRCFILES=\$(patsubst %%,${prefix_file}_find_%%.f90,\$(LIBSMM_INDICES)) \n"
-    
+
     #
     # list of executables
     #
     printf "LIBSMM_OBJFILES=\$(patsubst %%,\$(LIBSMM_WORKDIR)/${prefix_file}_find_%%.o,\$(LIBSMM_INDICES)) \n"
-    
+
     #
     # list of output files
     #
@@ -176,16 +176,16 @@ collect_results() {
 
 
 do_generate_tiny() {
-    # 
+    #
     # skip the compilation part if it needs only to collect the results
     #
     if [ "$run_cmd" != "true" ]; then
         #
         # compile the generator of tiny mults
         #
-	${host_compile} -c mults.f90 
+	${host_compile} -c mults.f90
 	${host_compile} mults.o tiny_gen.f90 -o tiny_gen.x
-	
+
         #
         # for easy parallelism go via a Makefile
         #
@@ -290,7 +290,7 @@ do_generate_small() {
 		printf " \$@"
 		printf " libxsmm_"
 		printf '`echo $* | awk -F_ '\''{ print $$6"_"$$7"_"$$8" "$$6" "$$7" "$$8" "$$6" "$$8" "$$6 }'\''` '
-		printf "1 1 0 0 ${SIMD_libxsmm} nopf ${data_libxsmm}" 
+		printf "1 1 0 0 ${SIMD_libxsmm} nopf ${data_libxsmm}"
 	    fi
 	    printf "\n\n"
 	) > ${make_file}
@@ -312,8 +312,8 @@ do_generate_lib() {
 	echo "Abort execution."
 	echo
 	exit
-    fi    
-    
+    fi
+
     #
     # Check if small file exists
     #
@@ -322,13 +322,13 @@ do_generate_lib() {
 	echo "Abort execution."
 	echo
 	exit
-    fi    
+    fi
 
     #
     # compile the generator of small mults
     #
-    ${host_compile} -c mults.f90 
-    ${host_compile} -c multrec_gen.f90 
+    ${host_compile} -c mults.f90
+    ${host_compile} -c multrec_gen.f90
     ${host_compile} mults.o multrec_gen.o lib_gen.f90 -o lib_gen.x
 
     #
@@ -357,7 +357,7 @@ do_generate_lib() {
     eles="(/0"
     for i in `seq 1 $maxsize`
     do
-	
+
 	found=0
 	for myn in ${dims_small}
 	do
@@ -369,12 +369,12 @@ do_generate_lib() {
 	    count=$((count+1))
 	    ele=$count
 	else
-	    ele=0 
+	    ele=0
 	fi
 	eles="$eles,$ele"
     done
     eles="$eles/)"
-   
+
     cd ${run_dir}
 
     file="smm${type_label}.f90"
@@ -401,7 +401,7 @@ do_generate_lib() {
 	printf " IF (N<=$maxsize) THEN\n   in=indx(N)\n ELSE\n   in=0\n ENDIF\n" >> ${file}
 	printf " IF (K<=$maxsize) THEN\n   ik=indx(K)\n ELSE\n   ik=0\n ENDIF\n" >> ${file}
 	printf " itot=(ik*($numsize+1)+in)*($numsize+1)+im\n" >> ${file}
-	
+
 	count=0
 	printf " SELECT CASE(itot)\n" >> ${file}
 	for myk in 0 ${dims_small}
@@ -451,7 +451,7 @@ do_generate_lib() {
 	    printf "END SUBROUTINE smm${type_label}\n\n" >> ${file}
 	fi
     }
-    
+
     write_routine
     write_routine 0
 
@@ -498,7 +498,7 @@ do_generate_lib() {
 	printf "LIBSMM_OBJFILES=\$(patsubst %%,\$(LIBSMM_WORKDIR)/smm${type_label}_%%.o,\$(LIBSMM_DIMS_INDICES)) \n\n"
 
 	printf ".PHONY: \$(LIBSMM_WORKDIR)/\$(LIBSMM_DRIVER) all_libsmm \n\n"
-	
+
 	printf "all_libsmm: \n\n"
 
         #
@@ -530,7 +530,7 @@ do_generate_lib() {
             printf " \$@"
             printf " libxsmm_"
             printf '`echo $* | awk -F_ '\''{ print $$4"_"$$5"_"$$6" "$$4" "$$5" "$$6" "$$4" "$$6" "$$4 }'\''` '
-            printf "1 1 0 0 ${SIMD_libxsmm} nopf ${data_libxsmm}\n" 
+            printf "1 1 0 0 ${SIMD_libxsmm} nopf ${data_libxsmm}\n"
         fi
 	printf "\n"
 
@@ -592,7 +592,7 @@ do_check() {
 	exit
     fi
 
-    # 
+    #
     # skip the compilation part if it needs only to collect the results
     #
     if [ "$run_cmd" != "true" ]; then
@@ -621,11 +621,11 @@ do_check() {
 		    if [ $element_end -eq 0 -o $element -gt $element_end ]; then
 			element_end=$(( element_end + nelements_in ))
 			ijob=$(( ijob + 1))
-			
+
 			if [ ${ijob} -le ${nelements_out} ]; then
 			    element_end=$(( element_end + 1))
 			fi
-			
+
 			echo "Preparing test program for job #$ijob..."
 			filename=${test_file}_job$ijob
 
@@ -715,7 +715,7 @@ SUBROUTINE testit(M,N,K)
      CALL MYRAND(C1)
      C2=C1
 
-     CALL ${gemm}("$ta","$tb",M,N,K,one,A,LDA,B,LDB,one,C1,M) 
+     CALL ${gemm}("$ta","$tb",M,N,K,one,A,LDA,B,LDB,one,C1,M)
      CALL smm${type_label}(M,N,K,A,B,C2)
 
      IF (MAXVAL(ABS(C2-C1))>100*EPSILON(REAL(1.0,KIND=KIND(A(1,1))))) THEN
@@ -732,11 +732,11 @@ SUBROUTINE testit(M,N,K)
 
   A=0; B=0; C1=0 ; C2=0
 
-  CALL CPU_TIME(t1) 
+  CALL CPU_TIME(t1)
   DO i=1,Niter
-     CALL ${gemm}("$ta","$tb",M,N,K,one,A,LDA,B,LDB,one,C1,M) 
+     CALL ${gemm}("$ta","$tb",M,N,K,one,A,LDA,B,LDB,one,C1,M)
   ENDDO
-  CALL CPU_TIME(t2) 
+  CALL CPU_TIME(t2)
 
   CALL CPU_TIME(t3)
   DO i=1,Niter
@@ -748,7 +748,7 @@ SUBROUTINE testit(M,N,K)
         " smm: ",Niter*flops/(t4-t3)/gflop," Gflops. Linked blas: ",Niter*flops/(t2-t1)/gflop,&
         " Gflops. Performance ratio: ",((t2-t1)/(t4-t3))*100,"%"
 
-END SUBROUTINE 
+END SUBROUTINE
 EOF
 if [ -n "${target_compile_offload}" ]; then
     printf '!dir$ attributes offload:mic :: testit \n ' >> ${work_dir}/${filename}.f90
@@ -796,7 +796,7 @@ EOF
 
 			rm -f ${work_dir}/${filename}.sh
 			#
-			# Prepare the script for compile the benchmarking 
+			# Prepare the script for compiling the benchmarking
 			# and testing program for the smm library
 			#
 			(
@@ -816,7 +816,7 @@ EOF
 			${run_cmd} ./${work_dir}/${filename}.sh
 		    fi
 
-		    element=$(( element + 1 ))  
+		    element=$(( element + 1 ))
 
 		done ; done ; done
 
@@ -859,5 +859,5 @@ EOF
 	echo "Final library can be found at ${archive}"
 	echo
     fi
-    
+
 }
diff --git a/tools/build_libsmm/make.gen b/tools/build_libsmm/make.gen
index fe06830b477..c297a85f821 100644
--- a/tools/build_libsmm/make.gen
+++ b/tools/build_libsmm/make.gen
@@ -97,7 +97,7 @@ $(LIBSMM_EXE:.x=.f90):
 	  done; \
 	  printf "\n"; \
 	 fi >> $@
-	@printf "PROGRAM $(notdir $(basename $@)) \n" >> $@ 
+	@printf "PROGRAM $(notdir $(basename $@)) \n" >> $@
 	@printf "  USE omp_lib \n" >> $@
 	@if [ -n "$(LIBSMM_MIC_OFFLOAD)" ]; then printf "  USE mic_lib \n" >> $@ ; fi
 	@printf "  IMPLICIT NONE \n" >> $@