Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

k-d tree performance improvements & KDE adjust fix #587

Merged
merged 13 commits into from
Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
Arraymancer v0.7.x
=====================================================

Arraymancer v0.7.22 Sep. 12 2023
=====================================================

- performance improvements to the k-d tree implementation by avoiding
`pow` and `sqrt` calls if unnecessary and providing a custom code
path for euclidean distances
- fix an issue in `kde` such that the `adjust` argument actually takes effect

Arraymancer v0.7.21 Aug. 31 2023
=====================================================

Expand Down
2 changes: 1 addition & 1 deletion src/arraymancer/laser/openmp.nim
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# Compile-time name mangling for OpenMP thresholds
# Workaround https://github.com/nim-lang/Nim/issues/9365
# and https://github.com/nim-lang/Nim/issues/9366
import random
import std / random
from strutils import toHex

var mangling_rng {.compileTime.} = initRand(0x1337DEADBEEF)
Expand Down
2 changes: 1 addition & 1 deletion src/arraymancer/ml/clustering/kmeans.nim
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) 2018 Mamy André-Ratsimbazafy and the Arraymancer contributors
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
# This file may not be copied, modified, or distributed except according to those terms.
import math, random, tables
import std / [math, random, tables]

import ../../tensor
import ../../spatial/distances
Expand Down
43 changes: 34 additions & 9 deletions src/arraymancer/spatial/distances.nim
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,30 @@ proc distance*(metric: typedesc[Euclidean], v, w: Tensor[float], squared: static
assert v.squeeze.rank == 1
assert w.squeeze.rank == 1
# Note: possibly faster by writing `let uv = u -. v; dot(uv, uv);` ?
result = Minkowski.distance(v, w, p = 2.0, squared = squared)
#result = Minkowski.distance(v, w, p = 2.0, squared = squared)
## NOTE: this is the branch used in the kd-tree. It's very performance critical there,
## hence we use this simple manual code (benchmarked to be more than 2x faster than
## via a 'higher order' approach).
## DBSCAN clustering test (11,000 points)
## - debug mode, old branch: 98.5s
## - debug mode, this branch: 50s
## - danger mode, old branch: 6.3s
## - danger mode, this branch: 2.8s
when squared:
if v.is_C_contiguous and w.is_C_contiguous:
result = 0.0
var tmp = 0.0
let vBuf = v.toUnsafeView()
let wBuf = w.toUnsafeView()
for idx in 0 ..< v.size:
# Index the raw contiguous buffer directly; this also works for rank 2 tensors with `[1, N]` size, as this is
# what we get from `pairwiseDistance` due to not squeezing the dimensions anymore.
tmp = vBuf[idx] - wBuf[idx] # no need for abs, as we square
result += tmp*tmp
else: # Fall back to broadcasting implementation which handles non contiguous data
result = sum( abs(v -. w).map_inline(x * x) )
else:
result = sqrt( sum( abs(v -. w).map_inline(x * x) ) )

proc distance*(metric: typedesc[Jaccard], v, w: Tensor[float]): float =
## Computes the Jaccard distance between points `v` and `w`. Both need to
Expand Down Expand Up @@ -107,6 +130,7 @@ proc pairwiseDistances*(metric: typedesc[AnyMetric],
## `[1, n_dimensions]`. In this case all distances between this point and
## all in the other input will be computed so that the result is always of
## shape `[n_observations]`.
## If one input has only shape `[n_dimensions]` it is unsqueezed to `[1, n_dimensions]`.
##
## The first argument is the metric to compute the distance under. If the Minkowski metric
## is selected the power `p` is used.
Expand All @@ -121,29 +145,30 @@ proc pairwiseDistances*(metric: typedesc[AnyMetric],
if x.rank == y.rank and x.shape[0] == y.shape[0]:
for idx in 0 ..< n_obs:
when metric is Minkowski:
result[idx] = Minkowski.distance(x[idx, _].squeeze, y[idx, _].squeeze,
result[idx] = Minkowski.distance(x[idx, _], y[idx, _],
p = p, squared = squared)
elif metric is Euclidean:
result[idx] = Euclidean.distance(x[idx, _].squeeze, y[idx, _].squeeze,
result[idx] = Euclidean.distance(x[idx, _], y[idx, _],
squared = squared)
else:
result[idx] = metric.distance(x[idx, _].squeeze, y[idx, _].squeeze)
result[idx] = metric.distance(x[idx, _], y[idx, _])
else:
# determine which one is 1 along n_observations
let nx = if x.rank == 2 and x.shape[0] == n_obs: x else: y
let ny = if x.rank == 2 and x.shape[0] == n_obs: y else: x
var ny = if x.rank == 2 and x.shape[0] == n_obs: y else: x
# in this case compute distance between all `nx` and single `ny`

if ny.rank == 1: # unsqueeze to have both rank 2
ny = ny.unsqueeze(0)
var idx = 0
for ax in axis(nx, 0):
when metric is Minkowski:
result[idx] = Minkowski.distance(ax.squeeze, ny.squeeze,
result[idx] = Minkowski.distance(ax, ny,
p = p, squared = squared)
elif metric is Euclidean:
result[idx] = Euclidean.distance(ax.squeeze, ny.squeeze,
result[idx] = Euclidean.distance(ax, ny,
squared = squared)
else:
result[idx] = metric.distance(ax.squeeze, ny.squeeze)
result[idx] = metric.distance(ax, ny)
inc idx

proc distanceMatrix*(metric: typedesc[AnyMetric],
Expand Down
48 changes: 37 additions & 11 deletions src/arraymancer/spatial/kdtree.nim
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import math, heapqueue, typetraits
import math, typetraits

import ../tensor
import ./distances

import std / heapqueue

#[

This module implements a k-d tree.
Expand Down Expand Up @@ -43,6 +45,8 @@ type
tree*: Node[T] ## the root node of the tree
size*: int ## number of nodes in the tree

proc isSquared(p: float): bool = abs(p - 2.0) < 1e-6

proc clone*[T](n: Node[T]): Node[T] =
result = Node[T](level: n.level,
id: n.id,
Expand Down Expand Up @@ -216,7 +220,7 @@ proc toTensorTuple[T, U](q: var HeapQueue[T],
retType: typedesc[U],
p = Inf): tuple[dist: Tensor[U],
idx: Tensor[int]] =
## Helper proc to convert the contents of the HeapQueue to a tuple of
## Helper proc to convert the contents of the `HeapQueue` to a tuple of
## two tensors.
##
## The heap queue here is used to accumulate neighbors in the `query` proc. It
Expand All @@ -227,18 +231,26 @@ proc toTensorTuple[T, U](q: var HeapQueue[T],
var vals = newTensorUninit[U](q.len)
var idxs = newTensorUninit[int](q.len)
var i = 0
let squared = isSquared(p)
if classify(p) == fcInf:
while q.len > 0:
let (val, idx) = q.pop
vals[i] = -val
idxs[i] = idx
inc i
else:
while q.len > 0:
let (val, idx) = q.pop
vals[i] = pow(-val, 1.0 / p)
idxs[i] = idx
inc i
if squared:
while q.len > 0:
let (val, idx) = q.pop
vals[i] = sqrt(-val)
idxs[i] = idx
inc i
else:
while q.len > 0:
let (val, idx) = q.pop
vals[i] = pow(-val, 1.0 / p)
idxs[i] = idx
inc i
result = (vals, idxs)

import ./tensor_compare_helper
Expand All @@ -257,14 +269,19 @@ proc queryImpl[T](
## and the static `yieldNumber` arguments it returns:
## - the `k` neighbors around `x` within a maximum `radius` (`yieldNumber = true`)
## - all points around `x` within `radius` (`yieldNumber = false`)
let squared = isSquared(p)

var side_distances = map2_inline(x -. tree.maxes,
tree.mins -. x):
max(0, max(x, y))

var min_distance: T
var distanceUpperBound = radius
if classify(p) != fcInf:
side_distances = side_distances.map_inline(pow(x, p))
if squared:
side_distances = side_distances.map_inline(x*x)
else:
side_distances = side_distances.map_inline(pow(x, p))
min_distance = sum(side_distances)
else:
min_distance = max(side_distances)
Expand All @@ -276,7 +293,6 @@ proc queryImpl[T](
bind tensor_compare_helper.`<`
var q = initHeapQueue[(T, Tensor[T], Node[T])]()
q.push (min_distance, side_distances.clone, tree.tree)

# priority queue for nearest neighbors, i.e. our result
# - (- distance ** p) from input `x` to current point
# - index of point in `KDTree's` data
Expand All @@ -288,12 +304,18 @@ proc queryImpl[T](
epsfac = 1.T
elif classify(p) == fcInf:
epsfac = T(1 / (1 + eps))
elif squared:
let tmp = 1 + eps
epsfac = T(1 / (tmp*tmp))
else:
epsfac = T(1 / pow(1 + eps, p))

# normalize the radius to the correct power
if classify(p) != fcInf and classify(distanceUpperBound) != fcInf:
distanceUpperBound = pow(distanceUpperBound, p)
if squared:
distanceUpperBound = distanceUpperBound*distanceUpperBound
else:
distanceUpperBound = pow(distanceUpperBound, p)

var node: Node[T]
while q.len > 0:
Expand Down Expand Up @@ -334,7 +356,11 @@ proc queryImpl[T](
sd[node.split_dim] = abs(node.split - x[node.split_dim])
min_distance = min_distance - side_distances[node.split_dim] + sd[node.split_dim]
else:
sd[node.split_dim] = pow(abs(node.split - x[node.split_dim]), p)
if squared:
let tmp = node.split - x[node.split_dim]
sd[node.split_dim] = tmp*tmp
else:
sd[node.split_dim] = pow(abs(node.split - x[node.split_dim]), p)
min_distance = min_distance - side_distances[node.split_dim] + sd[node.split_dim]

if min_distance <= distanceUpperBound * epsfac:
Expand Down
4 changes: 2 additions & 2 deletions src/arraymancer/spatial/neighbors.nim
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ proc nearestNeighbors*[T](X: Tensor[T], eps: float, metric: typedesc[AnyMetric],
let kd = kdTree(X)
result = newSeq[Tensor[int]](X.shape[0])
var idx = 0
for v in axis(X, 0):
let (dist, idxs) = kd.query_ball_point(v.squeeze, radius = eps, metric = metric)
for i in 0 ..< X.shape[0]:
let (dist, idxs) = kd.query_ball_point(X[i, _].squeeze, radius = eps, metric = metric)
result[idx] = idxs
inc idx
else:
Expand Down
9 changes: 4 additions & 5 deletions src/arraymancer/stats/kde.nim
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,8 @@ proc kde*[T: SomeNumber; U: int | Tensor[SomeNumber] | openArray[SomeNumber]](
var t = t.asType(float)
let A = min(std(t),
iqr(t) / 1.34)
let bwAct = if classify(bw) != fcNan: bw
else: 0.9 * A * pow(N.float, -1.0/5.0)

var bwAct = if classify(bw) == fcNormal: bw
else: adjust * (0.9 * A * pow(N.float, -1.0/5.0))
var weights = weights.asType(float)
if weights.size > 0:
doAssert weights.size == t.size
Expand All @@ -142,10 +141,10 @@ proc kde*[T: SomeNumber; U: int | Tensor[SomeNumber] | openArray[SomeNumber]](
let nsamples = samples
elif U is seq | array:
let x = toTensor(@samples).asType(float)
let nsamples = x.size
let nsamples = x.size.int
else:
let x = samples.asType(float)
let nsamples = x.size
let nsamples = x.size.int
result = newTensor[float](nsamples)
let norm = 1.0 / (N.float * bwAct)
var
Expand Down
28 changes: 21 additions & 7 deletions src/arraymancer/tensor/aggregate.nim
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,8 @@ proc cumprod*[T](arg: Tensor[T], axis: int = 0): Tensor[T] = # from hugogranstro
else:
temp[_] = result.atAxisIndex(axis, i-1) *. tAxis

when (NimMajor, NimMinor, NimPatch) > (1, 6, 0):
import std/atomics
proc nonzero*[T](arg: Tensor[T]): Tensor[int] =
## Returns the indices, which are non zero as a `Tensor[int]`.
##
Expand Down Expand Up @@ -368,15 +370,27 @@ proc nonzero*[T](arg: Tensor[T]): Tensor[int] =
## # - 1 -> 4 in col 1
## # - 0 -> 5 in col 0
## # - 1 -> 6 in col 1
var count = 0 # number of non zero elements
let mask = map_inline(arg):
block:
let cond = x != 0.T
if cond:
when (NimMajor, NimMinor, NimPatch) > (1, 6, 0):
## Use an `Atomic` counter, as otherwise the code breaks if compiled with `-d:openmp`!
var count: Atomic[int]
count.store(0)
let mask = map_inline(arg):
block:
let cond = x != 0.T
if cond:
atomicInc count
cond

result = newTensor[int]([arg.shape.len, count.load])
else:
let mask = map_inline(arg): # generate the mask
x != 0.T
var count = 0 # and count non zero elements (avoid openmp issues)
for x in mask:
if x:
inc count
cond
result = newTensor[int]([arg.shape.len, count])

result = newTensor[int]([arg.shape.len, count])
var ax = 0 # current axis
var k = 0 # counter for indices in one axis
for idx, x in mask:
Expand Down
2 changes: 1 addition & 1 deletion src/arraymancer/tensor/init_cpu.nim
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import
../laser/strided_iteration/foreach,
./data_structure,
# Standard library
random,
std / random,
math

export initialization
Expand Down
Loading