diff --git a/src/lib/cpp/cpu_seq/morphology.cc b/src/lib/cpp/cpu_seq/morphology.cc
index e3b11c6..e0a3513 100644
--- a/src/lib/cpp/cpu_seq/morphology.cc
+++ b/src/lib/cpp/cpu_seq/morphology.cc
@@ -47,7 +47,7 @@ void morphology_3d_sphere(
 }
 
 template <typename Op, uint32_t neutral>
-void morphology_3d_sphere_bitpacked(
+void morphology_3d_sphere_bitpacked_naive(
     const uint32_t *voxels,
     const int64_t radius,
     const int64_t N[3],
@@ -146,4 +146,81 @@ void morphology_3d_sphere_bitpacked(
     }
 }
 
+template <typename Op, uint32_t neutral>
+void morphology_3d_sphere_bitpacked(
+    const uint32_t *voxels,
+    const int64_t radius,
+    const int64_t N[3],
+    const int64_t strides[3],
+    uint32_t *result) {
+    // TODO assumes that Nx is a multiple of 32, which is true for scale <= 4
+    Op op;
+    int64_t
+        k = radius*2 + 1,
+        sqradius = radius * radius;
+
+    // TODO handle k < 32
+    // TODO templated construction? Has to 'hardcode' a radius, but that is beneficial anyways.
+    // Create the kernel
+    uint32_t *kernel = (uint32_t*) malloc(k*k*sizeof(uint32_t));
+
+    #pragma omp parallel for collapse(2)
+    for (int64_t z = -radius; z <= radius; z++) {
+        for (int64_t y = -radius; y <= radius; y++) {
+            uint32_t row = 0;
+            for (int64_t x = 0; x < 32; x++) {
+                uint32_t element = (x-radius)*(x-radius) + y*y + z*z <= sqradius;
+                row |= element << (31 - x);
+            }
+            kernel[(z+radius)*k + y+radius] = row;
+        }
+    }
+
+    #pragma omp parallel for collapse(3)
+    for (int64_t z = 0; z < N[0]; z++) {
+        for (int64_t y = 0; y < N[1]; y++) {
+            for (int64_t x = 0; x < N[2]/32; x++) {
+                // Compute boundaries
+                int64_t flat_index = z*strides[0] + y*strides[1] + x*strides[2];
+                int64_t X[3] = {z, y, x};
+                int64_t limits[6];
+                for (int axis = 0; axis < 3; axis++) {
+                    limits[(axis*2)] = -min(radius, X[axis]);
+                    limits[(axis*2)+1] = min(radius, N[axis] - X[axis] - 1);
+                }
+
+                // Apply the spherical kernel
+                uint32_t value = neutral;
+                for (int64_t pz = limits[0]; pz <= limits[1]; pz++) {
+                    for (int64_t py = limits[2]; py <= limits[3]; py++) {
+                        int64_t this_flat_index = flat_index + pz*strides[0] + py*strides[1];
+                        uint32_t
+                            left = x == 0 ? neutral : voxels[this_flat_index - 1],
+                            middle = voxels[this_flat_index],
+                            right = x == (N[2]/32)-1 ? neutral : voxels[this_flat_index + 1],
+                            kernel_row = kernel[(pz+radius)*k + (py+radius)];
+                            //voxels_row = voxels[this_flat_index],
+
+                        uint32_t this_row = 0;
+                        for (int64_t px = 0; px < 32; px++) {
+                            uint32_t this_x = 0 |
+                                (left << (32 - radius + px)) |
+                                (middle >> (radius - px)) |
+                                (middle << (-radius + px)) |
+                                (right >> (32 + radius - px));
+                            this_x &= kernel_row;
+                            this_x = this_x != 0; // This is for dilate - make generic / work for erode
+                            this_row |= this_x << (31 - px);
+                        }
+                        value = op(value, this_row);
+                    }
+                }
+
+                // Store the results
+                result[flat_index] = value;
+            }
+        }
+    }
+}
+
 } // namespace cpu_seq
\ No newline at end of file
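
Review note, not part of the patch: in the new morphology_3d_sphere_bitpacked, the window terms (left << (32 - radius + px)), (middle >> (radius - px)), (middle << (-radius + px)) and (right >> (32 + radius - px)) produce negative shift counts or counts of 32 and above for roughly half of the px values. That is undefined behaviour in C++, and on x86 the shift count is masked mod 32, so those terms do not simply evaluate to zero. The kernel buffer is also malloc'd but never freed. Below is a minimal sketch of one way to build the same 32-bit window with shift counts that always stay in [0, 63]; the helper name bit_window and the lm/mr locals are hypothetical, not from the patch, and it assumes 2*radius + 1 <= 32 exactly as the patch already does.

    #include <cstdint>

    // Sketch only. Returns the 32-bit voxel window that lines up with one kernel row
    // for bit position px of 'middle'; 'left'/'right' are the neighbouring packed
    // words (pass the neutral word at the volume edges, as the patch does).
    // Bit (31 - x) of the result is the voxel at offset x - radius from px,
    // matching the kernel row packing.
    static inline uint32_t bit_window(uint32_t left, uint32_t middle, uint32_t right,
                                      int64_t radius, int64_t px) {
        // Concatenate neighbouring words so every needed bit is reachable with a
        // single shift in [0, 63]: bit (31 - p) of lm is the voxel at offset p in
        // [-32, 31], and bit (63 - p) of mr is the voxel at offset p in [0, 63].
        const uint64_t lm = ((uint64_t)left << 32) | middle;
        const uint64_t mr = ((uint64_t)middle << 32) | right;
        const int64_t s = radius - px;
        return s >= 0 ? (uint32_t)(lm >> s)            // window may extend into 'left'
                      : (uint32_t)(mr >> (32 + s));    // window may extend into 'right'
    }

With something like this, the inner loop body would reduce to uint32_t this_x = bit_window(left, middle, right, radius, px) & kernel_row; the lm/mr concatenations only depend on left/middle/right and could be hoisted out of the px loop. A matching free(kernel) at the end of the function would close the leak.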