diff --git a/arraymancer.nimble b/arraymancer.nimble
index b8a3178b4..b7ef792d1 100644
--- a/arraymancer.nimble
+++ b/arraymancer.nimble
@@ -229,88 +229,28 @@ task test_mkl_omp, "Run all tests - Intel MKL + OpenMP":
 task test_release, "Run all tests - Release mode":
   test "tests_cpu", " -d:release"
 
-task gen_doc, "Generate Arraymancer documentation":
-  # TODO: Industrialize: something more robust that only check nim files (and not .DS_Store ...)
-  for filePath in listFiles("src/tensor/"):
-    let modName = filePath[11..^5] # Removing src/tensor/ (11 chars) and .nim (4 chars) # TODO: something more robust
-    # Cuda doc is broken https://github.com/nim-lang/Nim/issues/6910
-    # Delete doc comment from nimcuda before using this
-    exec r"nim doc -o:docs/build/tensor." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/nn_primitives/"):
-    let modName = filePath[18..^5] # Removing src/nn_primitives/ (18 chars) and .nim (4 chars) # TODO: something more robust
-    # Cuda doc is broken https://github.com/nim-lang/Nim/issues/6910
-    # Delete doc comment from nimcuda before using this
-    exec r"nim doc -o:docs/build/nnp." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/autograd/"):
-    let modName = filePath[13..^5] # Removing src/autograd/ (13 chars) and .nim (4 chars) # TODO: something more robust
-    exec r"nim doc -o:docs/build/ag." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/nn/"):
-    let modName = filePath[7..^5] # Removing src/nn_primitives/ (18 chars) and .nim (4 chars) # TODO: something more robust
-    exec r"nim doc -o:docs/build/nn." & modName & ".html " & filePath
-
-  # TODO auto check subdir
-  for filePath in listFiles("src/nn/activation/"):
-    let modName = filePath[18..^5]
-    exec r"nim doc -o:docs/build/nn_activation." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/nn/layers/"):
-    let modName = filePath[14..^5]
-    exec r"nim doc -o:docs/build/nn_layers." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/nn/loss/"):
-    let modName = filePath[12..^5]
-    exec r"nim doc -o:docs/build/nn_loss." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/nn/optimizers/"):
-    let modName = filePath[18..^5]
-    exec r"nim doc -o:docs/build/nn_optimizers." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/nn_dsl/"):
-    let modName = filePath[11..^5]
-    exec r"nim doc -o:docs/build/nn_dsl." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/linear_algebra/"):
-    let modName = filePath[19..^5]
-    exec r"nim doc -o:docs/build/la." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/stats/"):
-    let modName = filePath[10..^5]
-    exec r"nim doc -o:docs/build/stats." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/ml/clustering/"):
-    let modName = filePath[18..^5]
-    exec r"nim doc -o:docs/build/ml." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/ml/dimensionality_reduction/"):
-    let modName = filePath[32..^5]
-    exec r"nim doc -o:docs/build/ml." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/ml/metrics/"):
-    let modName = filePath[15..^5]
-    exec r"nim doc -o:docs/build/ml." & modName & ".html " & filePath
-
-  block:
-    let filePath = "src/nlp/tokenizers.nim"
-    let modName = filePath[8..^5]
-    exec r"nim doc -o:docs/build/nlp." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/io/"):
-    let modName = filePath[7..^5]
-    exec r"nim doc -o:docs/build/io." & modName & ".html " & filePath
-
-  for filePath in listFiles("src/datasets/"):
-    let modName = filePath[13..^5]
-    exec r"nim doc -o:docs/build/datasets." & modName & ".html " & filePath
-
-  # Process the rst
-  for filePath in listFiles("docs/"):
-    if filePath[^4..^1] == ".rst":
-      let modName = filePath[5..^5]
-      exec r"nim rst2html -o:docs/build/" & modName & ".html " & filePath
-
-  # Copy stylesheets
-  cpFile("docs/docutils.css", "docs/build/docutils.css")
-  cpFile("docs/nav.css", "docs/build/nav.css")
+
+template canImport(x: untyped): untyped =
+  compiles:
+    import x # the template evaluates to whether this import compiles
+
+when canImport(docs / docs):
+  # Only define the `gen_docs` task when `docs/docs` can be imported (the
+  # module is then already imported). This is a hack around weird nimble +
+  # nimscript behavior: when overwriting an install nimble will try to parse
+  # the generated nimscript file and for some reason then it won't be able to
+  # import the module (even if it's put into `src/`).
+  task gen_docs, "Generate Arraymancer documentation":
+    # generate nimdoc.cfg file so we can generate the correct header for the
+    # index.html page without having to mess with the HTML manually.
+    genNimdocCfg("src/")
+    # build the actual docs and the index
+    buildDocs("src/", "docs/build")
+    # Copy our stylesheets
+    cpFile("docs/docutils.css", "docs/build/docutils.css")
+    cpFile("docs/nav.css", "docs/build/nav.css")
+    # Process the rst files located directly in `docs/`
+    for filePath in listFiles("docs/"):
+      if filePath[^4..^1] == ".rst":
+        let modName = filePath[5..^5] # strip leading "docs/" and trailing ".rst"
+        exec r"nim rst2html -o:docs/build/" & modName & ".html " & filePath
diff --git a/docs/docs.nim b/docs/docs.nim
new file mode 100644
index 000000000..5bbbdedeb
--- /dev/null
+++ b/docs/docs.nim
@@ -0,0 +1,309 @@
+import macros, strformat, strutils, sequtils, sets, tables, algorithm
+
+from os import parentDir, getCurrentCompilerExe, DirSep, extractFilename, `/`, setCurrentDir
+
+when defined(nimdoc):
+  from os import getCurrentDir, paramCount, paramStr
+
+#[
+This file is a slightly modified version of the same file of `nimterop`:
+https://github.com/nimterop/nimterop/blob/master/nimterop/docs.nim
+]#
+
+
+proc getNimRootDir(): string =
+  #[
+  Hack, but works: applies `parentDir` three times to what `currentSourcePath`
+  yields inside `fmt` (NOTE(review): presumably strformat's file - confirm).
+  Alternative, more complex (from a nim file, not nims, otherwise you get
+  `Error: ambiguous call; both system.fileExists`):
+  import "$nim/testament/lib/stdtest/specialpaths.nim"; nimRootDir
+  ]#
+  fmt"{currentSourcePath}".parentDir.parentDir.parentDir
+
+const
+  DirSep = when defined(windows): '\\' else: '/' # path separator; NOTE(review): `DirSep` is also imported from `os` above
+
+proc execAction(cmd: string): string =
+  ## Runs `cmd` through `gorgeEx` (prefixed with `cmd /c` on Windows), returns
+  ## its output and fails a `doAssert` if the exit code is non-zero.
+  var shellCmd = cmd
+  when defined(Windows):
+    shellCmd = "cmd /c " & cmd
+  elif not defined(posix):
+    doAssert false
+  let (output, exitCode) = gorgeEx(shellCmd)
+  doAssert exitCode == 0,
+    "Command failed: " & $exitCode & "\ncmd: " & shellCmd &
+    "\nresult:\n" & output
+  output
+
+template genRemove(name: untyped): untyped =
+  proc `name`(s, toRemove: string): string = # returns a modified copy of `s`
+    result = s
+    result.`name`(toRemove) # in-place `name` on the copy (presumably strutils')
+genRemove(removePrefix)
+genRemove(removeSuffix)
+
+proc getFiles*(path: string): seq[string] =
+  ## Recursively collects all `.nim` files below `path`, except for those
+  ## whose file name is listed in `excludeFiles`.
+  # To skip whole directories as well, add a set of directory names and
+  # return early if `path.extractFilename` is contained in it.
+  # The files below are not valid by themselves, they are only included
+  # from other files
+  const excludeFiles = [ "blas_l3_gemm_aux.nim",
+                         "blas_l3_gemm_data_structure.nim",
+                         "blas_l3_gemm_macro_kernel.nim",
+                         "blas_l3_gemm_micro_kernel.nim",
+                         "blas_l3_gemm_packing.nim",
+                         "p_checks_cuda.nim",
+                         "p_checks_opencl.nim",
+                         "blis_api.nim" ]
+  let ExcludeFileSet = toHashSet(excludeFiles)
+
+  for file in listFiles(path):
+    if file.endsWith(".nim") and file.extractFilename notin ExcludeFileSet:
+      result.add file
+  for dir in listDirs(path):
+    result.add getFiles(dir)
+
+import nimDocTemplates
+
+proc buildDocs*(path: string, docPath: string, baseDir = getProjectPath() & $DirSep,
+                masterBranch = "master",
+                defines: openArray[string] = @[]) =
+  ## Generate docs for all nim files in `path` and output all HTML files to the
+  ## `docPath` in a flattened form (subdirectories are removed).
+  ##
+  ## If duplicate filenames are detected, they will be printed at the end.
+  ##
+  ## `baseDir` is the project path by default and `path` and `docPath` are
+  ## relative to that directory. Set to "" if using absolute paths.
+  ##
+  ## `masterBranch` is the name of the default branch to which the docs should link
+  ## when clicking the `Source` button below a procedure etc.
+  ##
+  ## `defines` is a list of `-d:xxx` define flags (the `xxx` part) that should be passed
+  ## to every `nim doc` invocation.
+  ##
+  ## Use the `--publish` flag with nimble to publish docs contained in
+  ## `path` to Github in the `gh-pages` branch. This requires the ghp-import
+  ## package for Python: `pip install ghp-import`
+  ##
+  ## WARNING: `--publish` will destroy any existing content in this branch.
+  ##
+  ## NOTE: `buildDocs()` only works correctly on Windows with Nim 1.0+ since
+  ## https://github.com/nim-lang/Nim/pull/11814 is required.
+  when defined(windows) and (NimMajor, NimMinor, NimPatch) < (1, 0, 0):
+    echo "buildDocs() unsupported on Windows for Nim < 1.0 - requires PR #11814"
+  else:
+    let
+      baseDir =
+        if baseDir == $DirSep:
+          getCurrentDir() & $DirSep
+        else:
+          baseDir
+      docPath = baseDir & docPath
+      path = baseDir & path
+      defStr = block:
+        var defStr = ""
+        for def in defines:
+          defStr &= " -d:" & def
+        defStr
+      nim = getCurrentCompilerExe()
+
+    # now we walk the whole `path` and build the documentation for each `.nim` file.
+    # While doing that we flatten the directory structure for the generated HTML files.
+    # `src/foo/bar/baz.nim` just becomes
+    # `docPath/baz.html`.
+    # This allows for all files to be in the `docPath` directory, which means each
+    # file will be able to find the `dochack.js` file, which will be put into
+    # the `docPath` directory, too (the inclusion of the `dochack.js` is done statically
+    # via our generated nimdoc.cfg file and is fixed for each generated HTML).
+    let files = getFiles(path)
+    var idx = 0
+    var fileSet = initHashSet[string]()
+    var duplSet = initHashSet[string]()
+    for file in files:
+      let baseName = file.extractFilename()
+      # Only the file name is kept (the directory structure is flattened), so
+      # modules sharing a file name end up with the same `outfile`. Such
+      # collisions are collected in `duplSet` and reported at the end.
+      # Only the trailing `.nim` extension is replaced.
+      var outfile = baseName.removeSuffix(".nim") & ".html"
+      if outfile in fileSet:
+        duplSet.incl outfile
+      else:
+        fileSet.incl outfile
+      outfile = docPath / outfile
+      echo "Processing: ", outfile, " [", idx + 1, "/", files.len, "]"
+      # NOTE: Changing the current working directory to the project path is required in order for
+      # `git.commit:` to work! Otherwise we sit in `docs` and for some reason the relative path
+      # will eat one piece of the resulting `source` links and thereby removing the actual branch
+      # and we end up with a broken link!
+      echo execAction(&"cd {getProjectPath()} && {nim} doc {defStr} --git.commit:{masterBranch} -o:{outfile} --index:on {file}")
+      inc idx
+    # now build the index
+    echo execAction(&"{nim} buildIndex -o:{docPath}/theindex.html {docPath}")
+    when declared(getNimRootDir):
+      #[
+      NOTE: running it locally doesn't work anymore on modern chromium browser,
+      because they block "access from origin 'null' due to CORS policy".
+      this enables doc search, works at least locally with:
+      cd {docPath} && python3 -m http.server 9009
+      ]#
+      echo execAction(&"{nim} js -o:{docPath}/dochack.js {getNimRootDir()}/tools/dochack/dochack.nim")
+
+    for i in 0 .. paramCount():
+      if paramStr(i) == "--publish":
+        echo execAction(&"cd {docPath} && ghp-import --no-jekyll -fp {docPath}")
+        break
+
+    # echo "Processed files: ", fileSet
+    if duplSet.card > 0:
+      echo "WARNING: Duplicate filenames detected: ", duplSet
+
+
+let nameMap = {
+  "dsl_core" : "Neural network: Declaration",
+  "relu" : "Activation: Relu (Rectified linear Unit)",
+  "sigmoid" : "Activation: Sigmoid",
+  "tanh" : "Activation: Tanh",
+  "conv2D" : "Layers: Convolution 2D",
+  "embedding" : "Layers: Embedding",
+  "gru" : "Layers: GRU (Gated Recurrent Unit)",
+  "linear" : "Layers: Linear/Dense",
+  "maxpool2D" : "Layers: Maxpool 2D",
+  "cross_entropy_losses" : "Loss: Cross-Entropy losses",
+  "mean_square_error_loss" : "Loss: Mean Square Error",
+  "softmax" : "Softmax",
+  "optimizers" : "Optimizers",
+  "init" : "Layers: Initializations",
+
+  "reshape_flatten" : "Reshape & Flatten",
+
+  "decomposition" : "Eigenvalue decomposition",
+  "decomposition_rand" : "Randomized Truncated SVD",
+  "least_squares" : "Least squares solver",
+  "linear_systems" : "Linear systems solver",
+  "special_matrices" : "Special linear algebra matrices",
+  "stats" : "Statistics",
+  "pca" : "Principal Component Analysis (PCA)",
+  "accuracy_score" : "Accuracy score",
+  "common_error_functions" : "Common errors, MAE and MSE (L1, L2 loss)",
+  "kmeans" : "K-Means",
+
+  "mnist" : "MNIST",
+  "imdb" : "IMDB",
+  "io_csv" : "CSV reading and writing",
+  "io_hdf5" : "HDF5 files reading and writing",
+  "io_image" : "Images reading and writing",
+  "io_npy" : "Numpy files reading and writing",
+
+  "autograd_common" : "Data structure",
+  "gates_basic" : "Basic operations",
+  "gates_blas" : "Linear algebra operations",
+  "gates_hadamard" : "Hadamard product (elementwise matrix multiply)",
+  "gates_reduce" : "Reduction operations",
+  "gates_shapeshifting_concat_split" : "Concatenation, stacking, splitting, chunking operations",
+  "gates_shapeshifting_views" : "Shapeshifting views", # NOTE(review): label used to duplicate `gates_blas`; confirm wording
+
+  "nnp_activation" : "Activations",
+  "nnp_convolution" : "Convolution 2D",
+  "nnp_conv2d_cudnn" : "Convolution 2D - CuDNN",
+  "nnp_embedding" : "Embeddings",
+  "nnp_gru" : "Gated Recurrent Unit (GRU)",
+  "nnp_linear" : "Linear / Dense layer",
+  "nnp_maxpooling" : "Maxpooling",
+  "nnp_numerical_gradient" : "Numerical gradient",
+  "nnp_sigmoid_cross_entropy" : "Sigmoid Cross-Entropy loss",
+  "nnp_softmax_cross_entropy" : "Softmax Cross-Entropy loss",
+  "nnp_softmax" : "Softmax"
+}.toTable
+
+proc wrap(name: string): string =
+  ## Renders one `<li>` header entry linking to `<name>.html`; the link text
+  ## is the `nameMap` title when one exists, otherwise `name` itself.
+  const tmpl = """<li><a href="$#">$#</a></li>"""
+  let title = nameMap.getOrDefault(name, name)
+  tmpl % [name & ".html", title]
+
+proc getHeaderMap(path: string): seq[seq[string]] =
+  ## Returns a nested seq where each element is a `seq[string]` containing
+  ## all file names to be added to one header category. The element at index
+  ## `i` fills the `$(i+1)` placeholder of `nimDocTemplates.headerTmpl`.
+  const excludeFiles = [ "nn", # only imports and exports `NN` files
+                         "nn_dsl", # only imports and exports `NN DSL` files
+                         "ml", # only imports and exports `ML` files
+                         "io", # only imports and exports `io` files
+                         "autograd", # only imports and exports `autograd` files
+                         "blis" # doesn't import or export anything
+  ]
+  let ExcludeFileSet = toHashSet(excludeFiles)
+  # map of the different header categories
+  let catMap = { "tensor" : 1,
+                 "nn" : 2,
+                 "nn_dsl" : 2,
+                 "linear_algebra" : 3,
+                 "stats" : 3,
+                 "ml" : 3,
+                 "datasets" : 4,
+                 "io" : 4,
+                 "autograd" : 5 ,
+                 "nn_primitives" : 6,
+                 "nlp" : 7,
+                 "math_ops_fusion" : 7,
+                 "laser" : 7,
+                 "private" : 7}.toTable
+
+  # `indexOverride` is used to override the index of the header the file
+  # is added to. Some files may be part of e.g. `tensor` but shouldn't be
+  # listed there, since they aren't that important.
+  # NOTE: the elements here are ``filenames`` and ``not`` directories!
+  let indexOverride = { "global_config" : 7 }.toTable
+  let files = getFiles(path)
+
+  result = newSeq[seq[string]](7)
+  for file in files:
+    let baseName = file.extractFilename()
+    let outfile = baseName.removeSuffix(".nim")
+    if outfile in ExcludeFileSet: continue
+    let subDir = file.removePrefix(path).split('/')[0]
+    if subDir in catMap:
+      # a file listed in `indexOverride` goes to that category instead of the
+      # one of its directory; categories are 1-based, `result` is 0-based
+      let category =
+        if outfile in indexOverride: indexOverride[outfile]
+        else: catMap[subDir]
+      result[category - 1].add outfile
+
+proc genNimdocCfg*(path: string) =
+  ## Generates the `nimdoc.cfg` file and writes it to the directory returned
+  ## by `getProjectPath()`. This lets us combine the front page template
+  ## derived from flyx's NimYaml (https://github.com/flyx/NimYAML) with the
+  ## standard Nim document generation. The header links are generated from
+  ## the files that `getHeaderMap(path)` reports for each header category.
+  ##
+  ## NOTE: manual intervention is required for each directory that is added
+  ## and should show up as its own tab in the header. Essentially look at the
+  ## `$<number>` spans in `nimDocTemplates.headerTmpl` to see what to do.
+  # one entry per `$N` placeholder of `headerTmpl`: the sorted file names of
+  # that category, each rendered as a `<li>` link by `wrap`, joined by
+  # newlines
+  var spans: seq[string]
+  for entries in getHeaderMap(path):
+    spans.add entries.sorted.mapIt(wrap(it)).join("\n")
+  # fill the HTML generation template from the filenames
+  let htmlTmpl = headerTmpl % spans
+  # assemble the config: header comment, `git.url`, the "see source" link
+  # template and finally the HTML document template
+  let fdata = [
+    "# Arraymancer documentation generation\n\n",
+    &"git.url = \"{gitUrl}\"\n\n",
+    &"doc.item.seesrc = \"\"\"{docItemSeeSrc}\"\"\"\n\n",
+    &"doc.file = \"\"\"{docFileTmpl}{htmlTmpl}\"\"\"\n"
+  ].join("")
+
+  # write the result to `nimdoc.cfg` in the project path
+  writeFile(getProjectPath() & $DirSep & "nimdoc.cfg", fdata)
diff --git a/docs/index.rst b/docs/index.rst
index b0ac76120..9354d1e44 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -144,7 +144,7 @@ Installation:
 Nim is available in some Linux repositories and on Homebrew for macOS.
 
 I however recommend installing Nim in your user profile via
-```choosenim`` <https://github.com/dom96/choosenim>`__. Once choosenim
+`choosenim <https://github.com/dom96/choosenim>`_. Once choosenim
 installed Nim, you can ``nimble install arraymancer`` which will pull
 arraymancer and all its dependencies.
 
diff --git a/docs/nav.css b/docs/nav.css
index 7efa28b2c..0a35153b0 100644
--- a/docs/nav.css
+++ b/docs/nav.css
@@ -77,6 +77,12 @@ header span ul.monospace a {
     font-family: "Source Code Pro", Menlo, "Courier New", Courier, monospace;
 }
 
+header span ul span ul {
+    max-height: 800px;/* cap the height of nested header lists; adjust as needed */
+    overflow:auto;/* scroll when the content exceeds max-height */
+}
+
+
 header a:link,
 header a:visited {
     background: inherit;
diff --git a/docs/nimDocTemplates.nim b/docs/nimDocTemplates.nim
new file mode 100644
index 000000000..e3e463cac
--- /dev/null
+++ b/docs/nimDocTemplates.nim
@@ -0,0 +1,194 @@
+const gitUrl* = "https://github.com/mratsim/arraymancer"
+
+const docItemSeeSrc* = """&nbsp;&nbsp;<a
+href="${url}/tree/${commit}/${path}#L${line}"
+class="link-seesrc" target="_blank">Source</a>
+<a href="${url}/edit/master/${path}#L${line}" class="link-seesrc" target="_blank" >Edit</a>
+"""
+
+# TODO: industrialize similar to Nim website: https://github.com/nim-lang/Nim/blob/e758b9408e8fe935117f7f793164f1c9b74cec06/tools/nimweb.nim#L45
+# And: https://github.com/nim-lang/Nim/blob/d3f966922ef4ddd05c137f82e5b2329b3d5dc485/web/website.ini#L31
+
+# TODO: move the technical reference to the end (need some CSS so that elements are properly placed)
+
+const docFileTmpl* = """<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<!--  This file is generated by Nim. -->
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+<!-- Favicon -->
+<link rel="shortcut icon" href="data:image/x-icon;base64,AAABAAEAEBAAAAEAIABoBAAAFgAAACgAAAAQAAAAIAAAAAEAIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AAAAAAUAAAAF////AP///wD///8A////AP///wD///8A////AP///wD///8A////AAAAAAIAAABbAAAAlQAAAKIAAACbAAAAmwAAAKIAAACVAAAAWwAAAAL///8A////AP///wD///8A////AAAAABQAAADAAAAAYwAAAA3///8A////AP///wD///8AAAAADQAAAGMAAADAAAAAFP///wD///8A////AP///wAAAACdAAAAOv///wD///8A////AP///wD///8A////AP///wD///8AAAAAOgAAAJ3///8A////AP///wAAAAAnAAAAcP///wAAAAAoAAAASv///wD///8A////AP///wAAAABKAAAAKP///wAAAABwAAAAJ////wD///8AAAAAgQAAABwAAACIAAAAkAAAAJMAAACtAAAAFQAAABUAAACtAAAAkwAAAJAAAACIAAAAHAAAAIH///8A////AAAAAKQAAACrAAAAaP///wD///8AAAAARQAAANIAAADSAAAARf///wD///8AAAAAaAAAAKsAAACk////AAAAADMAAACcAAAAnQAAABj///8A////AP///wAAAAAYAAAAGP///wD///8A////AAAAABgAAACdAAAAnAAAADMAAAB1AAAAwwAAAP8AAADpAAAAsQAAAE4AAAAb////AP///wAAAAAbAAAATgAAALEAAADpAAAA/wAAAMMAAAB1AAAAtwAAAOkAAAD/AAAA/wAAAP8AAADvAAAA3gAAAN4AAADeAAAA3gAAAO8AAAD/AAAA/wAAAP8AAADpAAAAtwAAAGUAAAA/AAAA3wAAAP8AAAD/AAAA/wAAAP8AAAD/AAAA/wAAAP8AAAD/AAAA/wAAAP8AAADfAAAAPwAAAGX///8A////AAAAAEgAAADtAAAAvwAAAL0AAADGAAAA7wAAAO8AAADGAAAAvQAAAL8AAADtAAAASP///wD///8A////AP///wD///8AAAAAO////wD///8A////AAAAAIcAAACH////AP///wD///8AAAAAO////wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A//8AAP//AAD4HwAA7/cAAN/7AAD//wAAoYUAAJ55AACf+QAAh+EAAAAAAADAAwAA4AcAAP5/AAD//wAA//8AAA=="/>
+<link rel="icon" type="image/png" sizes="32x32" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAABmJLR0QA/wD/AP+gvaeTAAAACXBIWXMAAA3XAAAN1wFCKJt4AAAAB3RJTUUH4QQQEwksSS9ZWwAAAk1JREFUWMPtll2ITVEUx39nn/O7Y5qR8f05wtCUUr6ZIS++8pEnkZInPImneaCQ5METNdOkeFBKUhMPRIkHKfEuUZSUlGlKPN2TrgfncpvmnntnmlEyq1Z7t89/rf9a6+y99oZxGZf/XeIq61EdtgKXgdXA0xrYAvBjOIF1AI9zvjcC74BSpndrJPkBWDScTF8Aa4E3wDlgHbASaANmVqlcCnwHvgDvgVfAJ+AikAAvgfVZwLnSVZHZaOuKoQi3ZOMi4NkYkpe1p4J7A8BpYAD49hfIy/oqG0+hLomiKP2L5L+1ubn5115S+3OAn4EnwBlgMzCjyt6ZAnQCJ4A7wOs88iRJHvw50HoujuPBoCKwHWiosy8MdfZnAdcHk8dxXFJ3VQbQlCTJvRBCGdRbD4M6uc5glpY3eAihpN5S5w12diSEcCCEcKUO4ljdr15T76ur1FDDLIQQ3qv71EdDOe3Kxj3leRXyk+pxdWnFWod6Wt2bY3de3aSuUHcPBVimHs7mK9WrmeOF6lR1o9qnzskh2ar2qm1qizpfXaPeVGdlmGN5pb09qMxz1Xb1kLqgzn1RyH7JUXW52lr5e/Kqi9qpto7V1atuUzfnARrV7jEib1T76gG2qxdGmXyiekkt1GswPTtek0aBfJp6YySGBfWg2tPQ0FAYgf1stUfdmdcjarbYJEniKIq6gY/Aw+zWHAC+p2labGpqiorFYgGYCEzN7oQdQClN07O1/EfDyGgC0ALMBdYAi4FyK+4H3gLPsxfR1zRNi+NP7nH5J+QntnXe5B5mpfQAAAAASUVORK5CYII=">
+
+<!-- Google fonts -->
+<link href='https://fonts.googleapis.com/css?family=Lato:400,600,900' rel='stylesheet' type='text/css'/>
+<link href='https://fonts.googleapis.com/css?family=Source+Code+Pro:400,500,600' rel='stylesheet' type='text/css'/>
+
+<!-- CSS -->
+<title>$title</title>
+<link rel="stylesheet" type="text/css" href="$nimdoccss">
+
+<script type="text/javascript" src="dochack.js"></script>
+
+<script type="text/javascript">
+function main() {
+  var pragmaDots = document.getElementsByClassName("pragmadots");
+  for (var i = 0; i < pragmaDots.length; i++) {
+    pragmaDots[i].onclick = function(event) {
+      // Hide tease
+      event.target.parentNode.style.display = "none";
+      // Show actual
+      event.target.parentNode.nextElementSibling.style.display = "inline";
+    }
+  }
+
+  const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+  function switchTheme(e) {
+      if (e.target.checked) {
+          document.documentElement.setAttribute('data-theme', 'dark');
+          localStorage.setItem('theme', 'dark');
+      } else {
+          document.documentElement.setAttribute('data-theme', 'light');
+          localStorage.setItem('theme', 'light');
+      }
+  }
+
+  toggleSwitch.addEventListener('change', switchTheme, false);
+
+
+  if (window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches) {
+    document.documentElement.setAttribute('data-theme', "dark");
+    toggleSwitch.checked = true;
+  } else if (window.matchMedia && window.matchMedia('(prefers-color-scheme: light)').matches) {
+    document.documentElement.setAttribute('data-theme', "light");
+    toggleSwitch.checked = false;
+  } else {
+    const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+    if (currentTheme) {
+      document.documentElement.setAttribute('data-theme', currentTheme);
+
+      if (currentTheme === 'dark') {
+        toggleSwitch.checked = true;
+      }
+    }
+  }
+}
+</script>
+
+</head>
+
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<title>Arraymancer - $title</title>
+
+<link href="docutils.css" rel="stylesheet" type="text/css"/>
+<link href="nav.css" rel="stylesheet" type="text/css"/>
+
+<link href='https://fonts.googleapis.com/css?family=Raleway:400,600,900' rel='stylesheet' type='text/css'/>
+<link href='https://fonts.googleapis.com/css?family=Source+Code+Pro:400,500,600' rel='stylesheet' type='text/css'/>
+
+<a href="https://github.com/mratsim/arraymancer"><img style="position: fixed; top: 0; right: 0; border: 0; z-index: 10;" src="https://camo.githubusercontent.com/652c5b9acfaddf3a9c326fa6bde407b87f7be0f4/68747470733a2f2f73332e616d617a6f6e6177732e636f6d2f6769746875622f726962626f6e732f666f726b6d655f72696768745f6f72616e67655f6666373630302e706e67" alt="Fork me on GitHub" data-canonical-src="https://s3.amazonaws.com/github/ribbons/forkme_right_orange_ff7600.png"></a>
+
+<body onload="main()">
+<div class="document" id="documentId">
+  <div class="container">
+    <h1 class="title">$title</h1>
+    $content
+    <div class="row">
+      <div class="twelve-columns footer">
+        <span class="nim-sprite"></span>
+        <br/>
+        <small style="color: var(--hint);">Made with Nim. Generated: $date $time UTC</small>
+      </div>
+    </div>
+  </div>
+</div>
+$analytics
+"""
+
+const headerTmpl* = """
+<header>
+  <a class="pagetitle" href="index.html">Arraymancer</a>
+  <span>
+    <a href="#">Technical reference</a>
+    <ul class="monospace">
+      <span>
+        <a href="#">Core tensor API</a>
+        <ul class="monospace">
+          $1
+        </ul>
+      </span>
+      <span>
+        <a href="#">Neural network API</a>
+        <ul class="monospace">
+          $2
+        </ul>
+      </span>
+      <span>
+        <a href="#">Linear algebra, stats, ML</a>
+        <ul class="monospace">
+          $3
+        </ul>
+      </span>
+      <span>
+        <a href="#">IO &amp; Datasets</a>
+        <ul class="monospace">
+          $4
+        </ul>
+      </span>
+      <span>
+        <a href="#">Autograd</a>
+        <ul class="monospace">
+          $5
+        </ul>
+      </span>
+      <span>
+        <a href="#">Neuralnet primitives</a>
+        <ul class="monospace">
+          $6
+        </ul>
+      </span>
+      <span>
+        <a href="#">Other docs</a>
+        <ul class="monospace">
+          $7
+        </ul>
+      </span>
+    </ul>
+  </span>
+  <span>
+    <a href="#">Tutorial</a>
+    <ul class="monospace">
+      <li><a href="tuto.first_steps.html">First steps</a></li>
+      <li><a href="tuto.slicing.html">Taking a slice of a tensor</a></li>
+      <li><a href="tuto.linear_algebra.html">Matrix &amp; vectors operations</a></li>
+      <li><a href="tuto.broadcasting.html">Broadcasted operations</a></li>
+      <li><a href="tuto.shapeshifting.html">Transposing, Reshaping, Permuting, Concatenating</a></li>
+      <li><a href="tuto.map_reduce.html">Map &amp; Reduce</a></li>
+      <li><a href="tuto.iterators.html">Basic iterators</a></li>
+    </ul>
+  </span>
+  <span>
+    <a href="#">Spellbook (How-To&apos;s)</a>
+    <ul class="monospace">
+      <li><a href="howto.type_conversion.html">How to convert a Tensor type?</a></li>
+      <li><a href="howto.ufunc.html">How to create a new universal function?</a></li>
+      <li><a href="howto.perceptron.html">How to create a multilayer perceptron?</a></li>
+    </ul>
+  </span>
+  <span>
+    <a href="#">Under the hood</a>
+    <ul class="monospace">
+      <li><a href="uth.speed.html">How Arraymancer achieves its speed?</a></li>
+      <li><a href="uth.copy_semantics.html">Why does `=` share data by default aka reference semantics?</a></li>
+      <li><a href="uth.opencl_cuda_nim.html">Working with OpenCL and Cuda in Nim</a></li>
+    </ul>
+  </span>
+</header>
+</body>
+</html>
+"""
diff --git a/nimdoc.cfg b/nimdoc.cfg
index 740838201..5d9d3f5f6 100644
--- a/nimdoc.cfg
+++ b/nimdoc.cfg
@@ -1,5 +1,4 @@
-## Arraymancer documentation generation
-# Inspiration from flyx's NimYaml: https://github.com/flyx/NimYAML
+# Arraymancer documentation generation
 
 git.url = "https://github.com/mratsim/arraymancer"
 
@@ -9,26 +8,103 @@ class="link-seesrc" target="_blank">Source</a>
 <a href="${url}/edit/master/${path}#L${line}" class="link-seesrc" target="_blank" >Edit</a>
 """
 
-# TODO: industrialize similar to Nim website: https://github.com/nim-lang/Nim/blob/e758b9408e8fe935117f7f793164f1c9b74cec06/tools/nimweb.nim#L45
-# And: https://github.com/nim-lang/Nim/blob/d3f966922ef4ddd05c137f82e5b2329b3d5dc485/web/website.ini#L31
-
-# TODO: move the technical reference to the end (need some CSS so that elements are properly placed)
-
-doc.file = """
-<!DOCTYPE html>
+doc.file = """<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<!--  This file is generated by Nim. -->
 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 <head>
-  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-  <title>Arraymancer - $title</title>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+<!-- Favicon -->
+<link rel="shortcut icon" href="data:image/x-icon;base64,AAABAAEAEBAAAAEAIABoBAAAFgAAACgAAAAQAAAAIAAAAAEAIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AAAAAAUAAAAF////AP///wD///8A////AP///wD///8A////AP///wD///8A////AAAAAAIAAABbAAAAlQAAAKIAAACbAAAAmwAAAKIAAACVAAAAWwAAAAL///8A////AP///wD///8A////AAAAABQAAADAAAAAYwAAAA3///8A////AP///wD///8AAAAADQAAAGMAAADAAAAAFP///wD///8A////AP///wAAAACdAAAAOv///wD///8A////AP///wD///8A////AP///wD///8AAAAAOgAAAJ3///8A////AP///wAAAAAnAAAAcP///wAAAAAoAAAASv///wD///8A////AP///wAAAABKAAAAKP///wAAAABwAAAAJ////wD///8AAAAAgQAAABwAAACIAAAAkAAAAJMAAACtAAAAFQAAABUAAACtAAAAkwAAAJAAAACIAAAAHAAAAIH///8A////AAAAAKQAAACrAAAAaP///wD///8AAAAARQAAANIAAADSAAAARf///wD///8AAAAAaAAAAKsAAACk////AAAAADMAAACcAAAAnQAAABj///8A////AP///wAAAAAYAAAAGP///wD///8A////AAAAABgAAACdAAAAnAAAADMAAAB1AAAAwwAAAP8AAADpAAAAsQAAAE4AAAAb////AP///wAAAAAbAAAATgAAALEAAADpAAAA/wAAAMMAAAB1AAAAtwAAAOkAAAD/AAAA/wAAAP8AAADvAAAA3gAAAN4AAADeAAAA3gAAAO8AAAD/AAAA/wAAAP8AAADpAAAAtwAAAGUAAAA/AAAA3wAAAP8AAAD/AAAA/wAAAP8AAAD/AAAA/wAAAP8AAAD/AAAA/wAAAP8AAADfAAAAPwAAAGX///8A////AAAAAEgAAADtAAAAvwAAAL0AAADGAAAA7wAAAO8AAADGAAAAvQAAAL8AAADtAAAASP///wD///8A////AP///wD///8AAAAAO////wD///8A////AAAAAIcAAACH////AP///wD///8AAAAAO////wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A//8AAP//AAD4HwAA7/cAAN/7AAD//wAAoYUAAJ55AACf+QAAh+EAAAAAAADAAwAA4AcAAP5/AAD//wAA//8AAA=="/>
+<link rel="icon" type="image/png" sizes="32x32" href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAABmJLR0QA/wD/AP+gvaeTAAAACXBIWXMAAA3XAAAN1wFCKJt4AAAAB3RJTUUH4QQQEwksSS9ZWwAAAk1JREFUWMPtll2ITVEUx39nn/O7Y5qR8f05wtCUUr6ZIS++8pEnkZInPImneaCQ5METNdOkeFBKUhMPRIkHKfEuUZSUlGlKPN2TrgfncpvmnntnmlEyq1Z7t89/rf9a6+y99oZxGZf/XeIq61EdtgKXgdXA0xrYAvBjOIF1AI9zvjcC74BSpndrJPkBWDScTF8Aa4E3wDlgHbASaANmVqlcCnwHvgDvgVfAJ+AikAAvgfVZwLnSVZHZaOuKoQi3ZOMi4NkYkpe1p4J7A8BpYAD49hfIy/oqG0+hLomiKP2L5L+1ubn5115S+3OAn4EnwBlgMzCjyt6ZAnQCJ4A7wOs88iRJHvw50HoujuPBoCKwHWiosy8MdfZnAdcHk8dxXFJ3VQbQlCTJvRBCGdRbD4M6uc5glpY3eAihpN5S5w12diSEcCCEcKUO4ljdr15T76ur1FDDLIQQ3qv71EdDOe3Kxj3leRXyk+pxdWnFWod6Wt2bY3de3aSuUHcPBVimHs7mK9WrmeOF6lR1o9qnzskh2ar2qm1qizpfXaPeVGdlmGN5pb09qMxz1Xb1kLqgzn1RyH7JUXW52lr5e/Kqi9qpto7V1atuUzfnARrV7jEib1T76gG2qxdGmXyiekkt1GswPTtek0aBfJp6YySGBfWg2tPQ0FAYgf1stUfdmdcjarbYJEniKIq6gY/Aw+zWHAC+p2labGpqiorFYgGYCEzN7oQdQClN07O1/EfDyGgC0ALMBdYAi4FyK+4H3gLPsxfR1zRNi+NP7nH5J+QntnXe5B5mpfQAAAAASUVORK5CYII=">
+
+<!-- Google fonts -->
+<link href='https://fonts.googleapis.com/css?family=Lato:400,600,900' rel='stylesheet' type='text/css'/>
+<link href='https://fonts.googleapis.com/css?family=Source+Code+Pro:400,500,600' rel='stylesheet' type='text/css'/>
+
+<!-- CSS -->
+<title>$title</title>
+<link rel="stylesheet" type="text/css" href="$nimdoccss">
+
+<script type="text/javascript" src="dochack.js"></script>
+
+<script type="text/javascript">
+function main() {
+  var pragmaDots = document.getElementsByClassName("pragmadots");
+  for (var i = 0; i < pragmaDots.length; i++) {
+    pragmaDots[i].onclick = function(event) {
+      // Hide tease
+      event.target.parentNode.style.display = "none";
+      // Show actual
+      event.target.parentNode.nextElementSibling.style.display = "inline";
+    }
+  }
+
+  const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+  function switchTheme(e) {
+      if (e.target.checked) {
+          document.documentElement.setAttribute('data-theme', 'dark');
+          localStorage.setItem('theme', 'dark');
+      } else {
+          document.documentElement.setAttribute('data-theme', 'light');
+          localStorage.setItem('theme', 'light');
+      }
+  }
+
+  toggleSwitch.addEventListener('change', switchTheme, false);
+
 
-  <link href="docutils.css" rel="stylesheet" type="text/css"/>
-  <link href="nav.css" rel="stylesheet" type="text/css"/>
+  if (window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches) {
+    document.documentElement.setAttribute('data-theme', "dark");
+    toggleSwitch.checked = true;
+  } else if (window.matchMedia && window.matchMedia('(prefers-color-scheme: light)').matches) {
+    document.documentElement.setAttribute('data-theme', "light");
+    toggleSwitch.checked = false;
+  } else {
+    const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+    if (currentTheme) {
+      document.documentElement.setAttribute('data-theme', currentTheme);
+
+      if (currentTheme === 'dark') {
+        toggleSwitch.checked = true;
+      }
+    }
+  }
+}
+</script>
 
-  <link href='http://fonts.googleapis.com/css?family=Raleway:400,600,900' rel='stylesheet' type='text/css'/>
-  <link href='http://fonts.googleapis.com/css?family=Source+Code+Pro:400,500,600' rel='stylesheet' type='text/css'/>
 </head>
-<body>
+
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<title>Arraymancer - $title</title>
+
+<link href="docutils.css" rel="stylesheet" type="text/css"/>
+<link href="nav.css" rel="stylesheet" type="text/css"/>
+
+<link href='https://fonts.googleapis.com/css?family=Raleway:400,600,900' rel='stylesheet' type='text/css'/>
+<link href='https://fonts.googleapis.com/css?family=Source+Code+Pro:400,500,600' rel='stylesheet' type='text/css'/>
+
 <a href="https://github.com/mratsim/arraymancer"><img style="position: fixed; top: 0; right: 0; border: 0; z-index: 10;" src="https://camo.githubusercontent.com/652c5b9acfaddf3a9c326fa6bde407b87f7be0f4/68747470733a2f2f73332e616d617a6f6e6177732e636f6d2f6769746875622f726962626f6e732f666f726b6d655f72696768745f6f72616e67655f6666373630302e706e67" alt="Fork me on GitHub" data-canonical-src="https://s3.amazonaws.com/github/ribbons/forkme_right_orange_ff7600.png"></a>
+
+<body onload="main()">
+<div class="document" id="documentId">
+  <div class="container">
+    <h1 class="title">$title</h1>
+    $content
+    <div class="row">
+      <div class="twelve-columns footer">
+        <span class="nim-sprite"></span>
+        <br/>
+        <small style="color: var(--hint);">Made with Nim. Generated: $date $time UTC</small>
+      </div>
+    </div>
+  </div>
+</div>
+$analytics
 <header>
   <a class="pagetitle" href="index.html">Arraymancer</a>
   <span>
@@ -37,115 +113,206 @@ doc.file = """
       <span>
         <a href="#">Core tensor API</a>
         <ul class="monospace">
-          <li><a href="tensor.accessors_macros_read.html">tensor.accessors_macros_read</a></li>
-          <li><a href="tensor.accessors_macros_syntax.html">tensor.accessors_macros_syntax</a></li>
-          <li><a href="tensor.accessors_macros_write.html">tensor.accessors_macros_write</a></li>
-          <li><a href="tensor.accessors.html">tensor.accessors</a></li>
-          <li><a href="tensor.aggregate.html">tensor.aggregate</a></li>
-          <li><a href="tensor.comparison.html">tensor.comparison</a></li>
-          <li><a href="tensor.data_structure.html">tensor.data_structure</a></li>
-          <li><a href="tensor.display.html">tensor.display</a></li>
-          <li><a href="tensor.display_cuda.html">tensor.display_cuda</a></li>
-          <li><a href="tensor.exporting.html">tensor.exporting</a></li>
-          <li><a href="tensor.filling_data.html">tensor.filling_data</a></li>
-          <li><a href="tensor.higher_order_applymap.html">tensor.higher_order_applymap</a></li>
-          <li><a href="tensor.higher_order_foldreduce.html">tensor.higher_order_foldreduce</a></li>
-          <li><a href="tensor.init_cpu.html">tensor.init_cpu</a></li>
-          <li><a href="tensor.init_cuda.html">tensor.init_cuda</a></li>
-          <li><a href="tensor.init_opencl.html">tensor.init_opencl</a></li>
-          <li><a href="tensor.init_copy_cpu.html">tensor.init_copy_cpu</a></li>
-          <li><a href="tensor.init_copy_cuda.html">tensor.init_copy_cuda</a></li>
-          <li><a href="tensor.lapack.html">tensor.lapack</a></li>
-          <li><a href="tensor.math_functions.html">tensor.math_functions</a></li>
-          <li><a href="tensor.operators_blas_l1.html">tensor.operators_blas_l1</a></li>
-          <li><a href="tensor.operators_blas_l1_cuda.html">tensor.operators_blas_l1_cuda</a></li>
-          <li><a href="tensor.operators_blas_l1_opencl.html">tensor.operators_blas_l1_opencl</a></li>
-          <li><a href="tensor.operators_blas_l2l3.html">tensor.operators_blas_l2l3</a></li>
-          <li><a href="tensor.operators_blas_l2l3_cuda.html">tensor.operators_blas_l2l3_cuda</a></li>
-          <li><a href="tensor.operators_blas_l2l3_opencl.html">tensor.operators_blas_l2l3_opencl</a></li>
-          <li><a href="tensor.operators_broadcasted.html">tensor.operators_broadcasted</a></li>
-          <li><a href="tensor.operators_broadcasted_cuda.html">tensor.operators_broadcasted_cuda</a></li>
-          <li><a href="tensor.operators_broadcasted_opencl.html">tensor.operators_broadcasted_opencl</a></li>
-          <li><a href="tensor.operators_comparison.html">tensor.operators_comparison</a></li>
-          <li><a href="tensor.operators_logical.html">tensor.operators_logical</a></li>
-          <li><a href="tensor.optim_ops_fusion.html">tensor.optim_ops_fusion</a></li>
-          <li><a href="tensor.shapeshifting.html">tensor.shapeshifting</a></li>
-          <li><a href="tensor.shapeshifting_cuda.html">tensor.shapeshifting_cuda</a></li>
-          <li><a href="tensor.shapeshifting_opencl.html">tensor.shapeshifting_opencl</a></li>
-          <li><a href="tensor.syntactic_sugar.html">tensor.syntactic_sugar</a></li>
-          <li><a href="tensor.ufunc.html">tensor.ufunc</a></li>
+          <li><a href="accessors.html">accessors</a></li>
+<li><a href="accessors_macros_read.html">accessors_macros_read</a></li>
+<li><a href="accessors_macros_syntax.html">accessors_macros_syntax</a></li>
+<li><a href="accessors_macros_write.html">accessors_macros_write</a></li>
+<li><a href="aggregate.html">aggregate</a></li>
+<li><a href="blas_l3_gemm.html">blas_l3_gemm</a></li>
+<li><a href="cublas.html">cublas</a></li>
+<li><a href="cuda.html">cuda</a></li>
+<li><a href="cuda_global_state.html">cuda_global_state</a></li>
+<li><a href="data_structure.html">data_structure</a></li>
+<li><a href="display.html">display</a></li>
+<li><a href="display_cuda.html">display_cuda</a></li>
+<li><a href="einsum.html">einsum</a></li>
+<li><a href="exporting.html">exporting</a></li>
+<li><a href="filling_data.html">filling_data</a></li>
+<li><a href="higher_order_applymap.html">higher_order_applymap</a></li>
+<li><a href="higher_order_foldreduce.html">higher_order_foldreduce</a></li>
+<li><a href="incl_accessors_cuda.html">incl_accessors_cuda</a></li>
+<li><a href="incl_higher_order_cuda.html">incl_higher_order_cuda</a></li>
+<li><a href="incl_kernels_cuda.html">incl_kernels_cuda</a></li>
+<li><a href="init_copy_cpu.html">init_copy_cpu</a></li>
+<li><a href="init_copy_cuda.html">init_copy_cuda</a></li>
+<li><a href="init_cpu.html">init_cpu</a></li>
+<li><a href="init_cuda.html">init_cuda</a></li>
+<li><a href="init_opencl.html">init_opencl</a></li>
+<li><a href="lapack.html">lapack</a></li>
+<li><a href="math_functions.html">math_functions</a></li>
+<li><a href="memory_optimization_hints.html">memory_optimization_hints</a></li>
+<li><a href="metadataArray.html">metadataArray</a></li>
+<li><a href="naive_l2_gemv.html">naive_l2_gemv</a></li>
+<li><a href="opencl_backend.html">opencl_backend</a></li>
+<li><a href="opencl_global_state.html">opencl_global_state</a></li>
+<li><a href="openmp.html">openmp</a></li>
+<li><a href="operators_blas_l1.html">operators_blas_l1</a></li>
+<li><a href="operators_blas_l1_cuda.html">operators_blas_l1_cuda</a></li>
+<li><a href="operators_blas_l1_opencl.html">operators_blas_l1_opencl</a></li>
+<li><a href="operators_blas_l2l3.html">operators_blas_l2l3</a></li>
+<li><a href="operators_blas_l2l3_cuda.html">operators_blas_l2l3_cuda</a></li>
+<li><a href="operators_blas_l2l3_opencl.html">operators_blas_l2l3_opencl</a></li>
+<li><a href="operators_broadcasted.html">operators_broadcasted</a></li>
+<li><a href="operators_broadcasted_cuda.html">operators_broadcasted_cuda</a></li>
+<li><a href="operators_broadcasted_opencl.html">operators_broadcasted_opencl</a></li>
+<li><a href="operators_comparison.html">operators_comparison</a></li>
+<li><a href="operators_logical.html">operators_logical</a></li>
+<li><a href="optim_ops_fusion.html">optim_ops_fusion</a></li>
+<li><a href="p_accessors.html">p_accessors</a></li>
+<li><a href="p_accessors_macros_desugar.html">p_accessors_macros_desugar</a></li>
+<li><a href="p_accessors_macros_read.html">p_accessors_macros_read</a></li>
+<li><a href="p_accessors_macros_write.html">p_accessors_macros_write</a></li>
+<li><a href="p_checks.html">p_checks</a></li>
+<li><a href="p_complex.html">p_complex</a></li>
+<li><a href="p_display.html">p_display</a></li>
+<li><a href="p_init_cpu.html">p_init_cpu</a></li>
+<li><a href="p_init_cuda.html">p_init_cuda</a></li>
+<li><a href="p_init_opencl.html">p_init_opencl</a></li>
+<li><a href="p_kernels_interface_cuda.html">p_kernels_interface_cuda</a></li>
+<li><a href="p_kernels_interface_opencl.html">p_kernels_interface_opencl</a></li>
+<li><a href="p_operator_blas_l2l3.html">p_operator_blas_l2l3</a></li>
+<li><a href="p_shapeshifting.html">p_shapeshifting</a></li>
+<li><a href="shapeshifting.html">shapeshifting</a></li>
+<li><a href="shapeshifting_cuda.html">shapeshifting_cuda</a></li>
+<li><a href="shapeshifting_opencl.html">shapeshifting_opencl</a></li>
+<li><a href="syntactic_sugar.html">syntactic_sugar</a></li>
+<li><a href="tensor.html">tensor</a></li>
+<li><a href="tensor_cuda.html">tensor_cuda</a></li>
+<li><a href="tensor_opencl.html">tensor_opencl</a></li>
+<li><a href="ufunc.html">ufunc</a></li>
         </ul>
       </span>
       <span>
         <a href="#">Neural network API</a>
         <ul class="monospace">
-          <li><a href="nn_dsl.dsl_core.html">Neural network: Declaration</a></li>
-          <li><a href="nn_activation.relu.html">Activation: Relu (Rectified linear Unit)</a></li>
-          <li><a href="nn_activation.sigmoid.html">Activation: Sigmoid</a></li>
-          <li><a href="nn_activation.tanh.html">Activation: Tanh</a></li>
-          <li><a href="nn_layers.conv2D.html">Layers: Convolution 2D</a></li>
-          <li><a href="nn_layers.embedding.html">Layers: Embedding</a></li>
-          <li><a href="nn_layers.gru.html">Layers: GRU (Gated Linear Unit)</a></li>
-          <li><a href="nn_layers.linear.html">Layers: Linear/Dense</a></li>
-          <li><a href="nn_layers.maxpool2D.html">Layers: Maxpool 2D</a></li>
-          <li><a href="nn_loss.cross_entropy_losses.html">Loss: Cross-Entropy losses</a></li>
-          <li><a href="nn_loss.mean_square_error_loss.html">Loss: Mean Square Error</a></li>
-          <li><a href="nn_optimizers.optimizers.html">Optimizers</a></li>
-          <li><a href="nn_shapeshifting.reshape_flatten.html">Reshape & Flatten</a></li>
+          <li><a href="conv2D.html">Layers: Convolution 2D</a></li>
+<li><a href="cross_entropy_losses.html">Loss: Cross-Entropy losses</a></li>
+<li><a href="dsl_core.html">Neural network: Declaration</a></li>
+<li><a href="dsl_forwardsugar.html">dsl_forwardsugar</a></li>
+<li><a href="dsl_initialization.html">dsl_initialization</a></li>
+<li><a href="dsl_topology.html">dsl_topology</a></li>
+<li><a href="dsl_types.html">dsl_types</a></li>
+<li><a href="dsl_utils.html">dsl_utils</a></li>
+<li><a href="embedding.html">Layers: Embedding</a></li>
+<li><a href="gru.html">Layers: GRU (Gated Recurrent Unit)</a></li>
+<li><a href="init.html">Layers: Initializations</a></li>
+<li><a href="linear.html">Layers: Linear/Dense</a></li>
+<li><a href="maxpool2D.html">Layers: Maxpool 2D</a></li>
+<li><a href="mean_square_error_loss.html">Loss: Mean Square Error</a></li>
+<li><a href="optimizers.html">Optimizers</a></li>
+<li><a href="relu.html">Activation: Relu (Rectified linear Unit)</a></li>
+<li><a href="sigmoid.html">Activation: Sigmoid</a></li>
+<li><a href="softmax.html">Softmax</a></li>
+<li><a href="tanh.html">Activation: Tanh</a></li>
         </ul>
       </span>
       <span>
         <a href="#">Linear algebra, stats, ML</a>
         <ul class="monospace">
-          <li><a href="la.decomposition.html">Eigenvalue decomposition</a></li>
-          <li><a href="la.decomposition_rand.html">Randomized Truncated SVD</a></li>
-          <li><a href="la.least_squares.html">Least squares solver</a></li>
-          <li><a href="la.linear_systems.html">Linear systems solver</a></li>
-          <li><a href="la.special_matrices.html">Special linear algebra matrices</a></li>
-          <li><a href="stats.stats.html">Statistics</a></li>
-          <li><a href="ml.pca.html">Principal Component Analysis (PCA)</a></li>
-          <li><a href="ml.accuracy_score.html">Accuracy score</a></li>
-          <li><a href="ml.common_error_functions.html">Common errors, MAE and MSE (L1, L2 loss)</a></li>
-          <li><a href="ml.kmeans.html">K-Means</a></li>
+          <li><a href="accuracy_score.html">Accuracy score</a></li>
+<li><a href="auxiliary_blas.html">auxiliary_blas</a></li>
+<li><a href="auxiliary_lapack.html">auxiliary_lapack</a></li>
+<li><a href="common_error_functions.html">Common errors, MAE and MSE (L1, L2 loss)</a></li>
+<li><a href="decomposition.html">Eigenvalue decomposition</a></li>
+<li><a href="decomposition_lapack.html">decomposition_lapack</a></li>
+<li><a href="decomposition_rand.html">Randomized Truncated SVD</a></li>
+<li><a href="init_colmajor.html">init_colmajor</a></li>
+<li><a href="kmeans.html">K-Means</a></li>
+<li><a href="least_squares.html">Least squares solver</a></li>
+<li><a href="least_squares_lapack.html">least_squares_lapack</a></li>
+<li><a href="linear_algebra.html">linear_algebra</a></li>
+<li><a href="linear_systems.html">Linear systems solver</a></li>
+<li><a href="overload.html">overload</a></li>
+<li><a href="pca.html">Principal Component Analysis (PCA)</a></li>
+<li><a href="solve_lapack.html">solve_lapack</a></li>
+<li><a href="special_matrices.html">Special linear algebra matrices</a></li>
+<li><a href="stats.html">Statistics</a></li>
+<li><a href="triangular.html">triangular</a></li>
         </ul>
       </span>
       <span>
         <a href="#">IO & Datasets</a>
         <ul class="monospace">
-          <li><a href="datasets.mnist.html">MNIST</a></li>
-          <li><a href="datasets.imdb.html">IMDB</a></li>
-          <li><a href="io.io_csv.html">CSV reading and writing</a></li>
-          <li><a href="io.io_hdf5.html">HDF5 files reading and writing</a></li>
-          <li><a href="io.io_image.html">Images reading and writing</a></li>
-          <li><a href="io.io_npy.html">Numpy files reading and writing</a></li>
+          <li><a href="imdb.html">IMDB</a></li>
+<li><a href="io_csv.html">CSV reading and writing</a></li>
+<li><a href="io_hdf5.html">HDF5 files reading and writing</a></li>
+<li><a href="io_image.html">Images reading and writing</a></li>
+<li><a href="io_npy.html">Numpy files reading and writing</a></li>
+<li><a href="io_stream_readers.html">io_stream_readers</a></li>
+<li><a href="mnist.html">MNIST</a></li>
+<li><a href="util.html">util</a></li>
         </ul>
       </span>
       <span>
         <a href="#">Autograd</a>
         <ul class="monospace">
-          <li><a href="ag.autograd_common.html">Data structure</a></li>
-          <li><a href="ag.gates_basic.html">Basic operations</a></li>
-          <li><a href="ag.gates_blas.html">Linear algebra operations</a></li>
-          <li><a href="ag.gates_hadamard.html">Hadamard product (elementwise matrix multiply)</a></li>
-          <li><a href="ag.gates_reduce.html">Reduction operations</a></li>
-          <li><a href="ag.gates_shapeshifting_concat_split.html">Concatenation, stacking, splitting, chunking operations</a></li>
-          <li><a href="ag.gates_shapeshifting_views.html">Linear algebra operations</a></li>
+          <li><a href="autograd_common.html">Data structure</a></li>
+<li><a href="gates_basic.html">Basic operations</a></li>
+<li><a href="gates_blas.html">Linear algebra operations</a></li>
+<li><a href="gates_hadamard.html">Hadamard product (elementwise matrix multiply)</a></li>
+<li><a href="gates_reduce.html">Reduction operations</a></li>
+<li><a href="gates_shapeshifting_concat_split.html">Concatenation, stacking, splitting, chunking operations</a></li>
+<li><a href="gates_shapeshifting_views.html">Shapeshifting view operations</a></li>
         </ul>
       </span>
       <span>
         <a href="#">Neuralnet primitives</a>
         <ul class="monospace">
-          <li><a href="nnp.nnp_activation.html">Activations</a></li>
-          <li><a href="nnp.nnp_convolution.html">Convolution 2D</a></li>
-          <li><a href="nnp.nnp_conv2D_cudnn.html">Convolution 2D - CuDNN</a></li>
-          <li><a href="nnp.nnp_embedding.html">Embeddings</a></li>
-          <li><a href="nnp.nnp_gru.html">Gated Recurrent Unit (GRU)</a></li>
-          <li><a href="nnp.nnp_linear.html">Linear / Dense layer</a></li>
-          <li><a href="nnp.nnp_maxpooling.html">Maxpooling</a></li>
-          <li><a href="nnp.nnp_numerical_gradient.html">Numerical gradient</a></li>
-          <li><a href="nnp.nnp_sigmoid_cross_entropy.html">Sigmoid Cross-Entropy loss</a></li>
-          <li><a href="nnp.nnp_softmax_cross_entropy.html">Softmax Cross-Entropy loss</a></li>
-          <li><a href="nnp.nnp_softmax.html">Softmax</a></li>
+          <li><a href="conv.html">conv</a></li>
+<li><a href="cudnn.html">cudnn</a></li>
+<li><a href="cudnn_conv_interface.html">cudnn_conv_interface</a></li>
+<li><a href="nn_primitives.html">nn_primitives</a></li>
+<li><a href="nnp_activation.html">Activations</a></li>
+<li><a href="nnp_conv2d_cudnn.html">Convolution 2D - CuDNN</a></li>
+<li><a href="nnp_convolution.html">Convolution 2D</a></li>
+<li><a href="nnp_embedding.html">Embeddings</a></li>
+<li><a href="nnp_gru.html">Gated Recurrent Unit (GRU)</a></li>
+<li><a href="nnp_linear.html">Linear / Dense layer</a></li>
+<li><a href="nnp_maxpooling.html">Maxpooling</a></li>
+<li><a href="nnp_numerical_gradient.html">Numerical gradient</a></li>
+<li><a href="nnp_sigmoid_cross_entropy.html">Sigmoid Cross-Entropy loss</a></li>
+<li><a href="nnp_softmax.html">Softmax</a></li>
+<li><a href="nnp_softmax_cross_entropy.html">Softmax Cross-Entropy loss</a></li>
+<li><a href="nnpack.html">nnpack</a></li>
+<li><a href="nnpack_interface.html">nnpack_interface</a></li>
+<li><a href="p_activation.html">p_activation</a></li>
+<li><a href="p_logsumexp.html">p_logsumexp</a></li>
+<li><a href="p_nnp_checks.html">p_nnp_checks</a></li>
+<li><a href="p_nnp_types.html">p_nnp_types</a></li>
+        </ul>
+      </span>
+      <span>
+        <a href="#">Other docs</a>
+        <ul class="monospace">
+          <li><a href="align_unroller.html">align_unroller</a></li>
+<li><a href="ast_utils.html">ast_utils</a></li>
+<li><a href="compiler_optim_hints.html">compiler_optim_hints</a></li>
+<li><a href="cpuinfo_x86.html">cpuinfo_x86</a></li>
+<li><a href="functional.html">functional</a></li>
+<li><a href="gemm.html">gemm</a></li>
+<li><a href="gemm_packing.html">gemm_packing</a></li>
+<li><a href="gemm_prepacked.html">gemm_prepacked</a></li>
+<li><a href="gemm_tiling.html">gemm_tiling</a></li>
+<li><a href="gemm_ukernel_avx.html">gemm_ukernel_avx</a></li>
+<li><a href="gemm_ukernel_avx2.html">gemm_ukernel_avx2</a></li>
+<li><a href="gemm_ukernel_avx512.html">gemm_ukernel_avx512</a></li>
+<li><a href="gemm_ukernel_avx_fma.html">gemm_ukernel_avx_fma</a></li>
+<li><a href="gemm_ukernel_dispatch.html">gemm_ukernel_dispatch</a></li>
+<li><a href="gemm_ukernel_generator.html">gemm_ukernel_generator</a></li>
+<li><a href="gemm_ukernel_generic.html">gemm_ukernel_generic</a></li>
+<li><a href="gemm_ukernel_sse.html">gemm_ukernel_sse</a></li>
+<li><a href="gemm_ukernel_sse2.html">gemm_ukernel_sse2</a></li>
+<li><a href="gemm_ukernel_sse4_1.html">gemm_ukernel_sse4_1</a></li>
+<li><a href="gemm_utils.html">gemm_utils</a></li>
+<li><a href="global_config.html">global_config</a></li>
+<li><a href="math_ops_fusion.html">math_ops_fusion</a></li>
+<li><a href="memory.html">memory</a></li>
+<li><a href="nested_containers.html">nested_containers</a></li>
+<li><a href="nlp.html">nlp</a></li>
+<li><a href="openmp.html">openmp</a></li>
+<li><a href="sequninit.html">sequninit</a></li>
+<li><a href="simd.html">simd</a></li>
+<li><a href="tokenizers.html">tokenizers</a></li>
         </ul>
       </span>
     </ul>
@@ -179,19 +346,6 @@ doc.file = """
     </ul>
   </span>
 </header>
-<article id="documentId">
-  <div class="container">
-    <h1 class="title">$title</h1>
-    $content
-    <div class="row">
-      <div class="twelve-columns footer">
-        <span class="nim-sprite"></span>
-        <br/>
-        <small>Made with Nim. Generated: $date $time UTC</small>
-      </div>
-    </div>
-  </div>
-</article>
 </body>
 </html>
 """
diff --git a/src/laser/primitives/matrix_multiplication/gemm_prepacked.nim b/src/laser/primitives/matrix_multiplication/gemm_prepacked.nim
index 1f3a90584..9e8338d4c 100644
--- a/src/laser/primitives/matrix_multiplication/gemm_prepacked.nim
+++ b/src/laser/primitives/matrix_multiplication/gemm_prepacked.nim
@@ -298,8 +298,9 @@ proc gemm_packed*[T: SomeNumber](
 #
 # ############################################################
 
-when isMainModule:
-
+when false:
+  ## these tests don't work in arraymancer, since the imported files are not
+  ## part of arraymancer's repository.
   import
     ../../tensor/[allocator, datatypes, initialization],
     strformat
diff --git a/src/laser/primitives/matrix_multiplication/gemm_tiling.nim b/src/laser/primitives/matrix_multiplication/gemm_tiling.nim
index 0ccf3ecf4..a29cd3177 100644
--- a/src/laser/primitives/matrix_multiplication/gemm_tiling.nim
+++ b/src/laser/primitives/matrix_multiplication/gemm_tiling.nim
@@ -218,7 +218,7 @@ func x86_ukernel*(cpu: CPUFeatureX86, T: typedesc, c_unit_stride: bool): MicroKe
   result.nb_vecs_nr = NbVecs[cpu]           # SIMD vectors of B
   result.nr = result.nb_vecs_nr * result.nb_scalars
 
-#############################################
+# #############################################
 # Workaround "undeclared identifier mr or nr"
 # for some reason the compiler cannot access fields in
 # the static MicroKernel.
diff --git a/src/laser/primitives/matrix_multiplication/gemm_ukernel_generator.nim b/src/laser/primitives/matrix_multiplication/gemm_ukernel_generator.nim
index f6dc50058..a99afe5a0 100644
--- a/src/laser/primitives/matrix_multiplication/gemm_ukernel_generator.nim
+++ b/src/laser/primitives/matrix_multiplication/gemm_ukernel_generator.nim
@@ -188,10 +188,10 @@ macro ukernel_simd_impl*(
     var declBody = newStmtList()
     for a in rA:
       declBody.add quote do:
-        var `a`{.noinit.}: `V`
+        var `a`{.noInit.}: `V`
     for b in rB:
       declBody.add quote do:
-        var `b`{.noinit.}: `V`
+        var `b`{.noInit.}: `V`
     for i in 0 ..< MR:
       for j in 0 ..< NbVecs:
         let ab = rAB[i][j]
diff --git a/src/laser/primitives/matrix_multiplication/gemm_ukernel_sse2.nim b/src/laser/primitives/matrix_multiplication/gemm_ukernel_sse2.nim
index c6f844d5a..2ec32e034 100644
--- a/src/laser/primitives/matrix_multiplication/gemm_ukernel_sse2.nim
+++ b/src/laser/primitives/matrix_multiplication/gemm_ukernel_sse2.nim
@@ -25,11 +25,11 @@ ukernel_generator(
       simd_fma = float64x2_muladd_unfused
     )
 
-#######################################
+# #######################################
 #
 # Int32: hack to unroll scalar code
 #
-#######################################
+# #######################################
 
 # This is faster than using the fallback for mm_mullo_epi32
 # in laser/primitives/private/sse2_utils
@@ -80,11 +80,11 @@ ukernel_generator(
     )
 
 
-#######################################
+# #######################################
 #
 # Int64: hack to unroll scalar code
 #
-#######################################
+# #######################################
 
 type Int64x2 = array[2, int64]
 
diff --git a/src/linear_algebra/helpers/auxiliary_lapack.nim b/src/linear_algebra/helpers/auxiliary_lapack.nim
index 8ad74c913..e198105bd 100644
--- a/src/linear_algebra/helpers/auxiliary_lapack.nim
+++ b/src/linear_algebra/helpers/auxiliary_lapack.nim
@@ -167,6 +167,7 @@ proc ormqr*[T: SomeFloat](C: var Tensor[T], Q: Tensor[T], tau: openarray[T], sid
 when isMainModule:
   import ./decomposition_lapack
   import ../../ml/metrics/common_error_functions
+  import ../../private/sequninit
 
   let a = [[12.0, -51.0, 4.0],
           [ 6.0, 167.0, -68.0],
diff --git a/src/linear_algebra/helpers/solve_lapack.nim b/src/linear_algebra/helpers/solve_lapack.nim
index 68ab1198c..3d38566c5 100644
--- a/src/linear_algebra/helpers/solve_lapack.nim
+++ b/src/linear_algebra/helpers/solve_lapack.nim
@@ -7,15 +7,15 @@ import
   ./overload,
   ../../tensor/tensor
 
-# Wrappers for Fortran LAPACK linear equation driver routines *SV
-# Currently only *GESV is wrapped
+# Wrappers for Fortran LAPACK linear equation driver routines `*SV`
+# Currently only `*GESV` is wrapped
 # TODO: Implement GBSV, GTSV, POSV, PBSV, PTSV, SYSV
 
 overload(gesv, sgesv)
 overload(gesv, dgesv)
 
 proc gesv*[T: SomeFloat](a, b: var Tensor[T], pivot_indices: var seq[int32]) =
-  ## Wrapper for LAPACK *gesv routines
+  ## Wrapper for LAPACK `*gesv` routines
   ## Solve AX = B for general matrix
   ##
   ## In-place version, this will overwrite a and b
diff --git a/src/nn_primitives/backend/cudnn.nim b/src/nn_primitives/backend/cudnn.nim
index 211f8f49b..a57aa7a10 100644
--- a/src/nn_primitives/backend/cudnn.nim
+++ b/src/nn_primitives/backend/cudnn.nim
@@ -56,7 +56,7 @@ template asCudnnType*[T: SomeFloat](typ: typedesc[T]): cudnnDataType_t =
 # #####################################################################
 # Tensor descriptor
 
-proc newCudnn4DTensorDesc*[T: SomeFloat](t: CudaTensor[T]): cudnnTensorDescriptor_t {.inline, noinit.}=
+proc newCudnn4DTensorDesc*[T: SomeFloat](t: CudaTensor[T]): cudnnTensorDescriptor_t {.inline, noInit.}=
   # TODO: destroy descriptor automatically
   # TODO: generalize with the NDTensor Desc
   check cudnnCreateTensorDescriptor(result.addr)
diff --git a/src/nn_primitives/backend/nnpack_interface.nim b/src/nn_primitives/backend/nnpack_interface.nim
index 1fd8fdf08..2defc6532 100644
--- a/src/nn_primitives/backend/nnpack_interface.nim
+++ b/src/nn_primitives/backend/nnpack_interface.nim
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import ../../tensor/tensor, ../types
+import ../../tensor/tensor, ../private/p_nnp_types
 import ./nnpack
 
 proc nnpack_conv2d*(input, weight, bias: Tensor[float32], padding, stride: Size2D): Tensor[float32] {.noInit.}= # TODO use a single convention, return value or var result
diff --git a/src/nn_primitives/nnp_maxpooling.nim b/src/nn_primitives/nnp_maxpooling.nim
index a6ff1e5d6..654d42a4c 100644
--- a/src/nn_primitives/nnp_maxpooling.nim
+++ b/src/nn_primitives/nnp_maxpooling.nim
@@ -21,7 +21,7 @@ proc maxpool2d*[T](input: Tensor[T],
                 kernel: Size2D,
                 padding: Size2D = (0,0),
                 stride: Size2D = (1,1)
-                ): tuple[max_indices: Tensor[int], maxpooled: Tensor[T]] {.noinit.}=
+                ): tuple[max_indices: Tensor[int], maxpooled: Tensor[T]] {.noInit.}=
   ## MaxPool 2D forward pass
 
   assert input.rank == 4 and input.is_C_contiguous
diff --git a/src/tensor/einsum.nim b/src/tensor/einsum.nim
index b414a407a..ad7cf6a37 100644
--- a/src/tensor/einsum.nim
+++ b/src/tensor/einsum.nim
@@ -4,163 +4,162 @@ import ./shapeshifting
   # Note: importing shapeshifting_cuda will trigger a Nim inference bug
   #       in genContiguous with no workaround
 
-#[
-This module provides Einstein summation for an arbitrary number of tensors.
+## This module provides Einstein summation for an arbitrary number of tensors.
+##
+## Einstein summation describes a special application of
+## `index notation <https://en.wikipedia.org/wiki/Index_notation>`_
+## in which indices that appear more than once are implicitly summed over.
+## This allows for a concise notation of many vector / matrix / tensor calculations,
+## while exactly representing the required calculation.
+##
+## In general Einstein summation is a subset of
+## `Ricci calculus <https://en.wikipedia.org/wiki/Ricci_calculus>`_.
+##
+## The implementation of `einsum` in different languages however, typically goes
+## above and beyond actual Einstein summation, allowing for many aspects of
+## Ricci calculus.
+##
+## Simple Einstein summation examples
+## ==================================
+##
+## Typical examples include matrix-vector multiplication, matrix-matrix multiplication
+## or the cross product. The examples below use the `einsum` notation for the
+## elements of tensors, namely `m[i,j]` for element `i,j` of the matrix ``m``, instead of
+## the more mathematical notation `m_ij`.
+##
+## Matrix-vector multiplication
+## ----------------------------
+##
+## Let ``m`` be an `NxM` matrix and ``v`` an `M` vector. Then matrix-vector multiplication
+## `m * v` is defined as:
+## `w[i] = \sum_j m[i,j] * v[j]`.
+## The result is an `N` vector ``w`` consisting of elements `w[i]`.
+## Since `j` appears twice on the RHS of the equation, Einstein summation implies that
+## the sum over `j` is implicit, hence we can write:
+##
+## `w[i] = m[i,j] * v[j]`.
+##
+## Matrix-matrix multiplication
+## ----------------------------
+##
+## The same can be applied to matrix-matrix multiplication. Let ``m``, ``n`` be two
+## compatible matrices (both `NxN` or `NxM` and `MxN`) with elements `m[i,j]` and
+## `n[i,j]`. Matrix-matrix multiplication is defined as
+##
+## `a[i,k] = \sum_j m[i,j] * n[j,k]`
+##
+## and thus in Einstein summation:
+##
+## `a[i,k] = m[i,j] * n[j,k]`.
+##
+## Cross-product of two vectors
+## ----------------------------
+##
+## The cross product of two 3-vectors ``v``, ``w`` can be conveniently defined using
+## the `Levi-Civita symbol <https://en.wikipedia.org/wiki/Levi-Civita_symbol#Three_dimensions>`_
+## `\epsilon_{ijk}`:
+##
+## `a[i] = \epsilon_{ijk} v[j] * w[k]`,
+##
+## which implies `j` and `k` are summed over, while `i` is kept for the resulting tensor.
+##
+## More complex examples
+## =====================
+##
+## In this implementation of `einsum` (similar to other `einsum` implementations),
+## it's also possible to explicitly keep different dimensions of the multiplied
+## tensors or even perform calculations without a single index appearing multiple
+## times, for instance to transpose a tensor. For these cases the explicit form
+## of the `einsum` macro has to be used, see below.
+##
+## Transposition of a matrix
+## -------------------------
+##
+## Transposition of a matrix can be expressed in index notation simply as an
+## exchange of indices, namely let ``m`` be an `NxM` matrix, the transposed
+## `MxN` matrix ``m^T`` is written as:
+##
+## `m[j,i] = m[i,j]`.
+##
+## Hadamard product
+## ----------------
+##
+## The Hadamard product defines the product of two `NxM` matrices ``n``, ``m``
+## in which the matrices are multiplied element wise. It is a good example
+## of the extension of `einsum` over standard Einstein summation:
+##
+## `a[i,j] = m[i,j] * n[i,j]`.
+##
+## Naive Einstein summation would demand a sum over both `i` and `j`, resulting
+## in a scalar on the LHS instead of another `NxM` matrix.
+##
+## Contracting a whole matrix
+## --------------------------
+##
+## Contracting a matrix over a repeated index sums its diagonal elements (the
+## trace of ``m``), resulting in a scalar `a`. It is expressed by:
+##
+## `a = m[i,i]`.
+##
+## The `einsum` macro
+## ==================
+##
+## The `einsum` macro provides two different usage paradigms.
+## * implicit <- normal Einstein summation
+## * explicit <- potential extended Einstein summation
+##
+## The macro takes a `varargs[Tensor]` and a single statement. It
+## returns a `Tensor[T]`, where `T` is deduced from the subtype of the
+## given tensors, if the result is not a scalar. For a scalar result
+## the return value is of type `T`. Note that the type of all given tensors
+## must match!
+##
+## The statement given to the macro is just a single line making use of
+## Einstein summation as in all the examples above. As a matter of fact
+## all examples above are valid statements for the `einsum` macro!
+##
+## Of course only tensors, which are given to the macro in the `varargs`
+## may be used in the statement.
+##
+## If only the `RHS` of the examples above is given, the required indices
+## for the resulting tensor are automatically calculated using pure Einstein
+## summation. Assuming `a`, `b` are two 2D arraymancer tensors, we could
+## express their matrix multiplication as
+##
+## .. code:: nim
+##    let c = einsum(a, b):
+##      a[i,j] * b[j,k]
+##
+## Of course the same can be written in explicit form:
+##
+## .. code:: nim
+##    let c = einsum(a, b):
+##      c[i,k] = a[i,j] * b[j,k]
+##
+## A few things must be noted here for the explicit case:
+## * the indices on the LHS are taken as "the truth"! Any index appearing here
+##   will ``not`` be summed over.
+## * the order on the LHS is taken into account, allowing for transposing
+##   dimensions.
+## * the identifier used on the LHS is arbitrary. It can match what the user assigns
+##   to, but need not.
+##
+## For many more examples for typical applications, take a look at the test case
+## `<../../tests/tensor/test_einsum.nim>`_.
+##
+## Implementation details
+## ----------------------
+##
+## The macro calculates which indices must be contracted and which remain in the
+## final tensor. For each appearing index (of either case) we create a for loop,
+## while the contracting for loops appear within the non contracting indices.
+##
+## The macro creates a `block`, in which the code is produced and returns the
+## temporary tensor used in it.
+##
+## It also forces the tensors into contiguous, row major form by creating
+## local copies with `asContiguous`.
 
-Einstein summation describes a special application of
-`index notation <https://en.wikipedia.org/wiki/Index_notation>`_
-in which indices that appear more than once are implicitly summed over.
-This allows for a concise notation of many vector / matrix / tensor calculations,
-while exactly representing the required calculation.
-
-In general Einstein summation is a subset of
-`Ricci calculus <https://en.wikipedia.org/wiki/Ricci_calculus>`_.
-
-The implementation of `einsum` in different languages however, typically goes
-above and beyond actual Einstein summation, allowing for many aspects of
-Ricci calculus.
-
-Simple Einstein summation examples
-==================================
-
-Typical examples include matrix-vector multiplcation, matrix-matrix multiplication
-or the cross product. The examples below use the `einsum` / notation for the
-elements of tensors, namely `m[i,j]` for element `i,j` of the matrix ``m``, instead of
-the more mathematical notation `m_ij`.
-
-Matrix-vector multiplication
-----------------------------
-
-Let ``m`` be an `NxM` matrix and ``v`` a `M` vector. Then matrix-vector multiplication
-`m * v` is defined as:
-`w[i] = \sum_j m[i,j] * v[j]`.
-The result is an `N` vector ``w`` consisting of elements `w[i]`.
-Since `j` appears twice on the RHS of the equation, Einstein summation implies that
-the sum over `j` is implicit, hence we can write:
-
-`w[i] = m[i,j] * v[j]`.
-
-Matrix-matrix multiplication
-----------------------------
-
-The same can be applied to matrix-matrix multiplication. Let ``m``, ``n`` be two
-compatible matrices (both `NxN` or `NxM` and `MxN`) with elements `m[i,j]` and
-`n[i,j]`. Matrix-matrix multiplication is defined as
-
-`a[i,k] = \sum_j m[i,j] * n[j,k]`
-
-and thus in Einstein summation:
-
-`a[i,k] = m[i,j] * n[j,k]`.
-
-Cross-product of two vectors
-----------------------------
-
-The cross product of two 3 vectors ``v``, ``w`` can be conveniently defined using
-the `Levi-Civita symbol <https://en.wikipedia.org/wiki/Levi-Civita_symbol#Three_dimensions>`_
-`\epsilon_{ijk}`:
-
-`a[i] = \epsilon_{ijk} v[j] * w[k]`,
-
-which implies `j` and `k` are summed over, while `i` is kept for the resulting tensor.
-
-More complex examples
-=====================
-
-In this implementation of `einsum` (similar to other `einsum` implementations),
-it's also possible to explicitly keep different dimensions of the multiplied
-tensors or even perform calculations without a single index appearing mutliple
-times, for instance to transpose a tensor. For these cases the explicit form
-of the `einsum` macro has to be used, see below.
-
-Transposition of a matrix
--------------------------
-
-Transposition of a matrix can be expressed in index notation simply as an
-exchange of indices, namely let ``m`` be an `NxM` matrix, the transposed
-`MxN` matrix ``m^T`` is written as:
-
-`m[j,i] = m[i,j]`.
-
-Hadamard product
-----------------
-
-The Hadamard product defines the product of two `NxM` matrices ``n``, ``m``
-in which the matrices are multiplied element wise. It is a good example
-of the extension of `einsum` over standard Einstein summation:
-
-`a[i,j] = m[i,j] * n[i,j]`.
-
-Naive Einstein summation would demand a sum over both `i` and `j`, resulting
-in a scalar on the LHS instead of another `NxM` matrix.
-
-Contracting a whole matrix
---------------------------
-
-Contraction of a full matrix describes summing all elements of a matrix
-``m``, resulting in a scalar `a`. It is expressed by:
-
-`a = m[i,i]`.
-
-The `einsum` macro
-==================
-
-The `einsum` macro provides two different usage paradigms.
-* implicit <- normal Einstein summation
-* explicit <- potential extended Einstein summation
-
-The macro takes a `varargs[Tensor]` and a single statement. It
-returns a `Tensor[T]`, where `T` is deduced from the subtype of the
-given tensors, if the result is not a scalar. For a scalar result
-the return value is of type `T`. Note that the type of all given tensors
-must match!
-
-The statement given to the macro is just a single line making use of
-Einstein summation as in all the examples above. As a matter of fact
-all examples above are valid statements for the `einsum` macro!
-
-Of course only tensors, which are given to the macro in the `varargs`
-may be used in the statement.
-
-If only the `RHS` of the examples above are given, the required indices
-for the resulting tensor are automatically calculated using pure Einstein
-summation. Assuming `a`, `b` are two 2D arraymancer tensors , we could
-express their matrix mutliplcation as
-
-.. code:: nim
-   let c = einsum(a, b):
-     a[i,j] * b[j,k]
-
-Of course the same can be written in explicit form:
-
-.. code:: nim
-   let c = einsum(a, b):
-     c[i,k] = a[i,j] * b[j,k]
-
-A few things must be noted here for the explicit case:
-* the indices on the LHS are taken as "the truth"! Any index appearing here
-  will ``not`` be summed over.
-* the order on the LHS is taken into account, allowing for transposing
-  dimensions.
-* the identifier used on the LHS is arbitrary. It can match what the user assigns
-  to, but need not.
-
-For many more examples for typical applications, take a look at the test case
-`<../../tests/tensor/test_einsum.nim>`_.
-
-Implementation details
-----------------------
-
-The macro calculates, which indices must be contracted and which remain in the
-final tensor. For each appearing index (of either case) we create a for loop,
-while the contracting for loops appear within the non contracting indices.
-
-The macro creates a `block`, in which the code is produced and returns the
-temporary tensor used in it.
-
-It also forces the tensors into contiguous, row major form by creating
-local copies with `asContiguous`.
-]#
 
 type
   # enum which stores whether an `einsum` call is explicit `skAssign` (statement
diff --git a/src/tensor/fallback/legacy/blas_l3_gemm_macro_kernel.nim b/src/tensor/fallback/legacy/blas_l3_gemm_macro_kernel.nim
index eff49f9f4..572e0fe75 100644
--- a/src/tensor/fallback/legacy/blas_l3_gemm_macro_kernel.nim
+++ b/src/tensor/fallback/legacy/blas_l3_gemm_macro_kernel.nim
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import ../backend/openmp
-
 proc gemm_macro_kernel[T](mc, nc, kc: int,
                           alpha: T,
                           beta: T,
@@ -58,4 +56,4 @@ proc gemm_macro_kernel[T](mc, nc, kc: int,
                 buffer_C,
                 1, MR,
                 C, i*MR*incRowC+j*NR*incColC + offC,
-                incRowC, incColC)
\ No newline at end of file
+                incRowC, incColC)
diff --git a/src/tensor/fallback/legacy/blas_l3_gemm_micro_kernel.nim b/src/tensor/fallback/legacy/blas_l3_gemm_micro_kernel.nim
index 2932c9e20..b21fbd20a 100644
--- a/src/tensor/fallback/legacy/blas_l3_gemm_micro_kernel.nim
+++ b/src/tensor/fallback/legacy/blas_l3_gemm_micro_kernel.nim
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import  macros,
-        ../backend/memory_optimization_hints
+        ../../backend/memory_optimization_hints
 
 macro unroll_ukernel[MRNR, T](AB: array[MRNR, T],
                               a: ptr UncheckedArray[T], offA: int,
diff --git a/src/tensor/private/p_kernels_interface_opencl.nim b/src/tensor/private/p_kernels_interface_opencl.nim
index 736fef7a4..4b9f7e5d7 100644
--- a/src/tensor/private/p_kernels_interface_opencl.nim
+++ b/src/tensor/private/p_kernels_interface_opencl.nim
@@ -126,11 +126,11 @@ template genClInfixOp*( T: typedesc,
     export procName
 
 template gen_cl_apply2*(kern_name, ctype, op: string): string =
-  ## Generates an OpenCL kernel for an elementwise in-place binary infix operation (like +=, -=, *.= or /.=)
+  ## Generates an OpenCL kernel for an elementwise in-place binary infix operation (like `+=, -=, *.= or /.=`)
   ## Input:
   ##   - The C type
   ##   - The C kernel name (this only helps debugging the C code)
-  ##   - The C operation (+=, -=, *.= or /.=)
+  ##   - The C operation (`+=, -=, *.= or /.=`)
 
   opencl_getIndexOfElementID() & """
   __kernel
@@ -165,13 +165,13 @@ template genClInPlaceOp*( T: typedesc,
                         cInfixOp: string,
                         exported: static[bool] = true): untyped =
   ## Generates an OpenCL kernel for an elementwise in-place binary
-  ## infix operation (like +=, -=, *.= or /.=)
+  ## infix operation (like `+=, -=, *.= or /.=`)
   ## Input:
   ##   - The Nim type of the elements of the input tensors
   ##   - The equivalent C type
   ##   - The Nim identifier of the resulting proc
   ##   - The C kernel name (this only helps debugging the C code)
-  ##   - The C operation (+=, -=, *.= or /.=)
+  ##   - The C operation (`+=, -=, *.= or /.=`)
 
   proc procName(dst: var ClTensor[T], src: ClTensor[T]) =
     when compileOption("boundChecks"):