From ab54f8fe3456fef037ffc54d89fc0a1cbed2b934 Mon Sep 17 00:00:00 2001
From: darkestpigeon
Date: Wed, 10 Jul 2024 12:33:43 +0200
Subject: [PATCH 1/6] optimized stridedIteration for non-contiguous tensors by
 explicitly looping over last two axes

---
 .../tensor/private/p_accessors.nim | 164 +++++++++++++++++-
 1 file changed, 157 insertions(+), 7 deletions(-)

diff --git a/src/arraymancer/tensor/private/p_accessors.nim b/src/arraymancer/tensor/private/p_accessors.nim
index 1e453b0a..f138e386 100644
--- a/src/arraymancer/tensor/private/p_accessors.nim
+++ b/src/arraymancer/tensor/private/p_accessors.nim
@@ -57,6 +57,47 @@ import ../backend/[global_config, memory_optimization_hints],
 #       coord[k] = 0
 #       iter_pos -= backstrides[k]
 
+type TensorForm = object
+  shape: Metadata
+  strides: Metadata
+
+proc rank(t: TensorForm): range[0 .. LASER_MAXRANK] {.inline.} =
+  t.shape.len
+
+func size(t: TensorForm): int {.inline.} =
+  result = 1
+  for i in 0..<t.rank:
+    result *= t.shape[i]
[...]
+template stridedIterationLoop*(strider: IterKind, data, t, iter_offset, iter_size, prev_d, last_d: typed) =
+  assert t.rank > 1
+
+  let prev_s = t.strides[^2]
+  let last_s = t.strides[^1]
+  let rank = t.rank
+  let size = t.size
+
+  initStridedIteration(coord, backstrides, iter_pos, t, 0, size)
+
+  # The end of the main block that loops over (prev_d, last_d) subtensors.
+  # Can be smaller than iter_offset, which means that no complete (prev_d, last_d)
+  # blocks are contained in the part we're iterating over.
+  let main_block_end =
+    if iter_offset + iter_size < size:
+      prev_d*last_d*((iter_offset + iter_size) div (prev_d*last_d))
+    else:
+      size
+
+  block iteration:
+
+    var i = iter_offset
+
+    if iter_offset > 0:
+      let onedim_end = min(
+        iter_offset + iter_size,
+        last_d*(((iter_offset - 1) div last_d) + 1))
+
+      if i < onedim_end:
+        coord[rank - 1] += onedim_end - i - 1
+        while i < onedim_end:
+          stridedIterationYield(strider, data, i, iter_pos)
+          iter_pos += last_s
+          i += 1
+        iter_pos -= last_s
+        advanceStridedIteration(coord, backstrides, iter_pos, t, iter_offset, iter_size)
+
+      if i == iter_offset + iter_size:
+        break iteration
+      # i is divisible by last_d at this point
+
+      let twodim_end = min(
+        prev_d*last_d*((iter_offset + iter_size) div (prev_d*last_d)),
+        prev_d*last_d*(((iter_offset - 1) div (prev_d*last_d)) + 1)
+      )
+
+      if i < twodim_end:
+        coord[rank - 2] += ((twodim_end - i) div last_d) - 1
+        coord[rank - 1] = last_d - 1
+        while i < twodim_end:
+          for _ in 0..<last_d:
[...]

From: darkestpigeon
Date: Wed, 10 Jul 2024 13:54:10 +0200
Subject: [PATCH 2/6] fixed an introduced stridedIteration bug for iter_offset
 != 0 or iter_size != tensor size

---
 src/arraymancer/tensor/private/p_accessors.nim | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/arraymancer/tensor/private/p_accessors.nim b/src/arraymancer/tensor/private/p_accessors.nim
index f138e386..64831276 100644
--- a/src/arraymancer/tensor/private/p_accessors.nim
+++ b/src/arraymancer/tensor/private/p_accessors.nim
@@ -215,7 +215,7 @@ template stridedIterationLoop*(strider: IterKind, data, t, iter_offset, iter_siz
   let rank = t.rank
   let size = t.size
 
-  initStridedIteration(coord, backstrides, iter_pos, t, 0, size)
+  initStridedIteration(coord, backstrides, iter_pos, t, iter_offset, iter_size)
 
   # The end of the main block that loops over (prev_d, last_d) subtensors.
   # Can be smaller than iter_offset, which means that no complete (prev_d, last_d)
   # blocks are contained in the part we're iterating over.
@@ -269,7 +269,6 @@
       break iteration
     # i is divisible by prev_d*last_d at this point
 
-    # main iteration block
     while i < main_block_end:
       for _ in 0..<prev_d:
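
Note (not part of the patch series): a minimal sketch of the code path that
patches 1 and 2 change, using only public arraymancer APIs (`toTensor`,
`reshape`, `transpose` and the `items` iterator). Iterating a non-contiguous
view goes through `stridedIteration`, which now runs tight loops over the last
two axes instead of doing per-element coordinate bookkeeping; patch 2's fix
matters whenever iteration starts at a nonzero offset, as with OpenMP chunking.

    import std/sequtils
    import arraymancer

    let a = toSeq(1..12).toTensor.reshape(3, 4)
    let t = a.transpose       # non-contiguous view: strides are reversed
    var acc = 0
    for x in t:               # items() iterates via stridedIteration
      acc += x
    doAssert acc == 78        # 1 + 2 + ... + 12
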
From: darkestpigeon
Date: Wed, 10 Jul 2024 13:55:11 +0200
Subject: [PATCH 3/6] made reshape use map_inline instead of apply2_inline

---
 src/arraymancer/tensor/private/p_shapeshifting.nim | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/arraymancer/tensor/private/p_shapeshifting.nim b/src/arraymancer/tensor/private/p_shapeshifting.nim
index 2207b247..06dd4c60 100644
--- a/src/arraymancer/tensor/private/p_shapeshifting.nim
+++ b/src/arraymancer/tensor/private/p_shapeshifting.nim
@@ -30,15 +30,16 @@ proc contiguousImpl*[T](t: Tensor[T], layout: OrderType, result: var Tensor[T])
     apply2_inline(result, t):
       y
 
-proc reshape_with_copy*[T](t: Tensor[T], new_shape: varargs[int]|Metadata|seq[int], result: var Tensor[T]) =
-  result = newTensorUninit[T](new_shape)
-  result.apply2_inline(t,y)
-
 proc reshape_no_copy*(t: AnyTensor, new_shape: varargs[int]|Metadata|seq[int], result: var AnyTensor, layout: OrderType) {.noSideEffect.}=
   result.shape.copyFrom(new_shape)
   shape_to_strides(result.shape, layout, result.strides)
   result.offset = t.offset
 
+proc reshape_with_copy*[T](t: Tensor[T], new_shape: varargs[int]|Metadata|seq[int], result: var Tensor[T]) =
+  var cont: Tensor[T]
+  contiguousImpl(t, rowMajor, cont)
+  reshape_no_copy(cont, new_shape, result, rowMajor)
+
 proc infer_shape*(t: Tensor, new_shape: varargs[int]): seq[int] {.noinit.} =
   ## Replace the single -1 value on `new_shape` with the value that
   ## makes the size the same as that of the input tensor

From 70c011cc2aba86e95f7816b2653250af4db205ad Mon Sep 17 00:00:00 2001
From: darkestpigeon
Date: Thu, 11 Jul 2024 23:24:25 +0200
Subject: [PATCH 4/6] fixed a bug in reshape modification (was returning
 uninitialized tensor when copying)

---
 src/arraymancer/tensor/private/p_shapeshifting.nim | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/arraymancer/tensor/private/p_shapeshifting.nim b/src/arraymancer/tensor/private/p_shapeshifting.nim
index 06dd4c60..840b1c8e 100644
--- a/src/arraymancer/tensor/private/p_shapeshifting.nim
+++ b/src/arraymancer/tensor/private/p_shapeshifting.nim
@@ -36,9 +36,8 @@ proc reshape_no_copy*(t: AnyTensor, new_shape: varargs[int]|Metadata|seq[int], r
   result.offset = t.offset
 
 proc reshape_with_copy*[T](t: Tensor[T], new_shape: varargs[int]|Metadata|seq[int], result: var Tensor[T]) =
-  var cont: Tensor[T]
-  contiguousImpl(t, rowMajor, cont)
-  reshape_no_copy(cont, new_shape, result, rowMajor)
+  contiguousImpl(t, rowMajor, result)
+  reshape_no_copy(t, new_shape, result, rowMajor)
 
 proc infer_shape*(t: Tensor, new_shape: varargs[int]): seq[int] {.noinit.} =
   ## Replace the single -1 value on `new_shape` with the value that
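
Note (not part of the patch series): the copy path of `reshape` is taken
exactly when the input is not contiguous. Patch 3's version wrote the
contiguous copy into the temporary `cont`, but `reshape_no_copy(cont, ...)`
only rewrites metadata, so `result` never received the copied storage -- the
uninitialized-tensor bug that patch 4 fixes by materializing the copy directly
into `result` and only then rewriting the metadata. A small sketch of the
affected path:

    import std/sequtils
    import arraymancer

    let a = toSeq(1..12).toTensor.reshape(3, 4)
    # a.transpose is non-contiguous, so this reshape must take the
    # reshape_with_copy path rather than the no-copy metadata rewrite:
    let b = a.transpose.reshape(2, 6)
    doAssert b[0, 0] == 1 and b[0, 3] == 2   # row-major order of the transpose
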
From 827e2d246a326f58e914e2fb30263d8b29dec369 Mon Sep 17 00:00:00 2001
From: darkestpigeon
Date: Wed, 17 Jul 2024 12:29:23 +0200
Subject: [PATCH 5/6] refactored the loop code, added asserts for better
 clarity

---
 .../tensor/private/p_accessors.nim | 173 +++++++++---------
 1 file changed, 91 insertions(+), 82 deletions(-)

diff --git a/src/arraymancer/tensor/private/p_accessors.nim b/src/arraymancer/tensor/private/p_accessors.nim
index 64831276..98048f99 100644
--- a/src/arraymancer/tensor/private/p_accessors.nim
+++ b/src/arraymancer/tensor/private/p_accessors.nim
@@ -98,6 +98,12 @@
   result.shape.len = i + 1
   result.strides.len = i + 1
 
+func floor(x: int, divisor: int): int {.inline.} =
+  return divisor*(x div divisor)
+
+func ceil(x: int, divisor: int): int {.inline.} =
+  return divisor*(((x - 1) div divisor) + 1)
+
 proc getIndex*[T](t: Tensor[T], idx: varargs[int]): int {.noSideEffect,inline.} =
   ## Convert [i, j, k, l ...] to the proper index.
   when compileOption("boundChecks"):
@@ -215,96 +221,98 @@ template stridedIterationLoop*(strider: IterKind, data, t, iter_offset, iter_siz
   let rank = t.rank
   let size = t.size
 
+  assert iter_offset >= 0
+  assert iter_size <= size - iter_offset
+  assert prev_d > 0 and last_d > 0
+  assert size mod prev_d*last_d == 0
+
   initStridedIteration(coord, backstrides, iter_pos, t, iter_offset, iter_size)
 
-  # The end of the main block that loops over (prev_d, last_d) subtensors.
-  # Can be smaller than iter_offset, which means that no complete (prev_d, last_d)
-  # blocks are contained in the part we're iterating over.
-  let main_block_end =
-    if iter_offset + iter_size < size:
-      prev_d*last_d*((iter_offset + iter_size) div (prev_d*last_d))
+  let bp1 =
+    if iter_offset == 0:
+      0
+    else:
+      min(iter_offset + iter_size, ceil(iter_offset, last_d))
+  let bp2 =
+    if iter_offset == 0:
+      0
+    else:
+      max(bp1, min(floor(iter_offset + iter_size, prev_d*last_d), ceil(iter_offset, prev_d*last_d)))
+  let bp3 =
+    if iter_size == size:
+      size
+    else:
+      max(bp2, floor(iter_offset + iter_size, prev_d*last_d))
+  let bp4 =
+    if iter_size == size:
+      size
+    else:
+      max(bp3, floor(iter_offset + iter_size, last_d))
+
+  assert iter_offset <= bp1 and bp1 <= bp2 and bp2 <= bp3 and bp3 <= bp4 and bp4 <= iter_offset + iter_size
+  assert bp1 - iter_offset < last_d and (bp1 mod last_d == 0 or bp1 == iter_offset + iter_size)
+  assert bp2 == bp1 or (bp2 mod prev_d*last_d == 0 and bp2 - bp1 < prev_d*last_d)
+  assert bp3 == bp2 or bp3 mod prev_d*last_d == 0
+  assert bp4 == bp3 or (bp4 mod last_d == 0 and bp4 - bp3 < prev_d*last_d)
+  assert iter_offset + iter_size - bp4 < last_d
+
+  var i = iter_offset
+
+  if bp1 > iter_offset:
+    coord[rank - 1] += bp1 - i - 1
+    while i < bp1:
+      stridedIterationYield(strider, data, i, iter_pos)
+      iter_pos += last_s
+      i += 1
+    iter_pos -= last_s
+    advanceStridedIteration(coord, backstrides, iter_pos, t, iter_offset, iter_size)
+
+  if bp2 > bp1:
+    coord[rank - 2] += ((bp2 - i) div last_d) - 1
+    coord[rank - 1] = last_d - 1
+    while i < bp2:
+      for _ in 0..<last_d:
[...]
-    else:
-      size
-
-  block iteration:
-
-    var i = iter_offset
-
-    if iter_offset > 0:
-      let onedim_end = min(
-        iter_offset + iter_size,
-        last_d*(((iter_offset - 1) div last_d) + 1))
-
-      if i < onedim_end:
-        coord[rank - 1] += onedim_end - i - 1
-        while i < onedim_end:
-          stridedIterationYield(strider, data, i, iter_pos)
-          iter_pos += last_s
-          i += 1
-        iter_pos -= last_s
-        advanceStridedIteration(coord, backstrides, iter_pos, t, iter_offset, iter_size)
-
-      if i == iter_offset + iter_size:
-        break iteration
-      # i is divisible by last_d at this point
-
-      let twodim_end = min(
-        prev_d*last_d*((iter_offset + iter_size) div (prev_d*last_d)),
-        prev_d*last_d*(((iter_offset - 1) div (prev_d*last_d)) + 1)
-      )
-
-      if i < twodim_end:
-        coord[rank - 2] += ((twodim_end - i) div last_d) - 1
-        coord[rank - 1] = last_d - 1
-        while i < twodim_end:
-          for _ in 0..<last_d:
[...]
+  if bp4 > bp3:
+    coord[rank - 2] += ((bp4 - i) div last_d) - 1
+    coord[rank - 1] = last_d - 1
+    while i < bp4:
+      for _ in 0..<last_d:
+        stridedIterationYield(strider, data, i, iter_pos)
+        iter_pos += last_s
+        i += 1
[...]
@@ -330,11 +338,12 @@ template stridedIteration*(strider: IterKind, t, iter_offset, iter_size: typed):
   ## Iterate over a Tensor, displaying data as in C order, whatever the strides.
   var tf = TensorForm(shape: t.shape, strides: t.strides)
   tf = tf.reduceRank()
+  assert tf.rank >= 1
   if tf.rank == 1:
     let s = tf.strides[^1]
     for i in iter_offset..<(iter_offset+iter_size):
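
Note (not part of the patch series): a worked example of the new breakpoints.
The `floor`/`ceil` helpers below mirror the ones added in this patch; the
shape and range are made-up numbers. For a `[2, 3, 4]` tensor (`prev_d = 3`,
`last_d = 4`, `size = 24`), iterating `[5, 20)` splits into the partial row
`[5, 8)`, the partial block `[8, 12)`, an empty run of complete blocks
(`bp2 == bp3`), the trailing rows `[12, 20)`, and an empty final partial row
(`bp4 == 20`).

    func floor(x: int, divisor: int): int = divisor*(x div divisor)
    func ceil(x: int, divisor: int): int = divisor*(((x - 1) div divisor) + 1)

    let (prev_d, last_d, size) = (3, 4, 24)
    let (iter_offset, iter_size) = (5, 15)   # iterate over [5, 20)

    # iter_offset != 0 and iter_size != size, so the else branches apply:
    let bp1 = min(iter_offset + iter_size, ceil(iter_offset, last_d))
    let bp2 = max(bp1, min(floor(iter_offset + iter_size, prev_d*last_d),
                           ceil(iter_offset, prev_d*last_d)))
    let bp3 = max(bp2, floor(iter_offset + iter_size, prev_d*last_d))
    let bp4 = max(bp3, floor(iter_offset + iter_size, last_d))
    doAssert (bp1, bp2, bp3, bp4) == (8, 12, 12, 20)
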
From 7af4b9170dc60a4ad4db456e377dba78310643fe Mon Sep 17 00:00:00 2001
From: darkestpigeon
Date: Wed, 17 Jul 2024 12:42:03 +0200
Subject: [PATCH 6/6] added a comment for the stridedIterationLoop

---
 src/arraymancer/tensor/private/p_accessors.nim | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/arraymancer/tensor/private/p_accessors.nim b/src/arraymancer/tensor/private/p_accessors.nim
index 98048f99..fc868e17 100644
--- a/src/arraymancer/tensor/private/p_accessors.nim
+++ b/src/arraymancer/tensor/private/p_accessors.nim
@@ -214,6 +214,15 @@ template stridedIterationYield*(strider: IterKind, data, i, iter_pos: typed) =
   elif strider == IterKind.Offset_Values:
     yield (iter_pos, data[iter_pos]) ## TODO: remove workaround for C++ backend
 
 template stridedIterationLoop*(strider: IterKind, data, t, iter_offset, iter_size, prev_d, last_d: typed) =
+  ## We break up the tensor in 5 parts and iterate over each using for loops.
+  ## We do this because the loop ranges and nestedness are different for each part.
+  ## The part boundaries are calculated and stored in the `bp1`, `bp2`, `bp3`
+  ## and `bp4` variables. The `(iter_offset, bp1)` segment is a rank-1 tensor
+  ## of size `<last_d`, the `(bp1, bp2)` segment is a rank-2 tensor of shape
+  ## `(<prev_d, last_d)`, the `(bp2, bp3)` segment is a rank-3 tensor of shape
+  ## `(N, prev_d, last_d)`, the `(bp3, bp4)` segment is a rank-2 tensor of shape
+  ## `(<prev_d, last_d)`, and the `(bp4, iter_offset + iter_size)` segment is
+  ## a rank-1 tensor of size `<last_d`.
   assert t.rank > 1
 
   let prev_s = t.strides[^2]
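
Note (not part of the patch series): an end-to-end sketch; `permute`,
`map_inline` and `sum` are existing public arraymancer APIs. A rank-3 view
with reversed axes cannot be fused by `reduceRank`, so `stridedIteration`
dispatches to `stridedIterationLoop` and walks the five segments described in
the comment added above.

    import std/sequtils
    import arraymancer

    let t = toSeq(1..24).toTensor.reshape(2, 3, 4).permute(2, 1, 0)
    doAssert t.sum == 300         # the reduction iterates the strided view
    let u = t.map_inline(2 * x)   # map_inline also goes through stridedIteration
    doAssert u.sum == 600
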